|
# for generate (gradio server) and finetune |
|
datasets==2.13.0 |
|
sentencepiece==0.1.99 |
|
gradio==3.37.0 |
|
huggingface_hub==0.16.4 |
|
appdirs==1.4.4 |
|
fire==0.5.0 |
|
docutils==0.20.1 |
|
torch==2.0.1; sys_platform != "darwin" and platform_machine != "arm64" |
|
evaluate==0.4.0 |
|
rouge_score==0.1.2 |
|
sacrebleu==2.3.1 |
|
scikit-learn==1.2.2 |
|
alt-profanity-check==1.2.2 |
|
better-profanity==0.7.0 |
|
numpy==1.24.3 |
|
pandas==2.0.2 |
|
matplotlib==3.7.1 |
|
loralib==0.1.1 |
|
bitsandbytes==0.39.0 |
|
accelerate==0.20.3 |
|
peft==0.4.0 |
|
transformers==4.30.2 |
|
tokenizers==0.13.3 |
|
APScheduler==3.10.1 |
|
|
|
# optional for generate |
|
pynvml==11.5.0 |
|
psutil==5.9.5 |
|
boto3==1.26.101 |
|
botocore==1.29.101 |
|
|
|
# optional for finetune |
|
tensorboard==2.13.0 |
|
neptune==1.2.0 |
|
|
|
# for gradio client |
|
gradio_client==0.2.10 |
|
beautifulsoup4==4.12.2 |
|
markdown==3.4.3 |
|
|
|
# data and testing |
|
pytest==7.2.2 |
|
pytest-xdist==3.2.1 |
|
nltk==3.8.1 |
|
textstat==0.7.3 |
|
# pandoc==2.3 |
|
pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64" |
|
pypandoc_binary==1.11; platform_machine == "x86_64" |
|
openpyxl==3.1.2 |
|
lm_dataformat==0.0.20 |
|
bioc==2.0 |
|
|
|
# falcon |
|
einops==0.6.1 |
|
instructorembedding==1.0.1 |
|
|
|
# for gpt4all .env file, but avoid worrying about imports |
|
python-dotenv==1.0.0 |
|
|
|
text-generation==0.6.0 |
|
# for tokenization when an HF tokenizer is not available |
|
tiktoken==0.4.0 |
|
# optional: for OpenAI endpoint or embeddings (requires key) |
|
openai==0.27.8 |
|
# optional for chat with PDF |
|
langchain==0.0.235 |
|
pypdf==3.12.2 |
|
# avoid textract, requires old six |
|
#textract==1.6.5 |
|
|
|
# for HF embeddings |
|
sentence_transformers==2.2.2 |
|
|
|
# local vector db |
|
chromadb==0.3.25 |
|
# server vector db |
|
#pymilvus==2.2.8 |
|
|
|
# weak url support, if can't install opencv etc. If you uncomment this one, then comment out unstructured[local-inference]==0.7.4 |
|
# unstructured==0.8.1 |
|
|
|
# strong support for images |
|
# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice |
|
unstructured[local-inference]==0.7.4 |
|
#pdf2image==1.16.3 |
|
#pytesseract==0.3.10 |
|
pillow |
|
|
|
pdfminer.six==20221105 |
|
urllib3 |
|
requests_file |
|
|
|
#pdf2image==1.16.3 |
|
#pytesseract==0.3.10 |
|
tabulate==0.9.0 |
|
# FYI pandoc already part of requirements.txt |
|
|
|
# for JSONLoader, but causes trouble for some users |
|
# jq==1.4.1 |
|
|
|
# to check licenses |
|
# Run: pip-licenses|grep -v 'BSD\|Apache\|MIT' |
|
pip-licenses==4.3.0 |
|
|
|
# weaviate vector db |
|
weaviate-client==3.22.1 |
|
# optional for chat with PDF |
|
langchain==0.0.235 |
|
pypdf==3.12.2 |
|
# avoid textract, requires old six |
|
#textract==1.6.5 |
|
|
|
# for HF embeddings |
|
sentence_transformers==2.2.2 |
|
|
|
# local vector db |
|
chromadb==0.3.25 |
|
# server vector db |
|
#pymilvus==2.2.8 |
|
|
|
# weak url support, if can't install opencv etc. If you uncomment this one, then comment out unstructured[local-inference]==0.7.4 |
|
# unstructured==0.8.1 |
|
|
|
# strong support for images |
|
# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice |
|
unstructured[local-inference]==0.7.4 |
|
#pdf2image==1.16.3 |
|
#pytesseract==0.3.10 |
|
pillow |
|
|
|
pdfminer.six==20221105 |
|
urllib3 |
|
requests_file |
|
|
|
#pdf2image==1.16.3 |
|
#pytesseract==0.3.10 |
|
tabulate==0.9.0 |
|
# FYI pandoc already part of requirements.txt |
|
|
|
# for JSONLoader, but causes trouble for some users |
|
# jq==1.4.1 |
|
|
|
# to check licenses |
|
# Run: pip-licenses|grep -v 'BSD\|Apache\|MIT' |
|
pip-licenses==4.3.0 |
|
|
|
# weaviate vector db |
|
weaviate-client==3.22.1 |
|
faiss-gpu==1.7.2 |
|
arxiv==1.4.8 |
|
pymupdf==1.22.5 # AGPL license |
|
# extract-msg==0.41.1 # GPL3 |
|
|