lxml
nltk

[huggingface]
transformers

[pdf]
layoutparser[layoutmodels,tesseract]
