pseudotensor committed on
Commit d1a52ba
1 Parent(s): 06c87e6

Update with h2oGPT hash 2391b7887b928de3239af955d0263a66fb8a9710

Files changed (13)
  1. client_test.py +81 -52
  2. create_data.py +1818 -0
  3. finetune.py +4 -378
  4. generate.py +302 -110
  5. gpt4all_llm.py +119 -0
  6. gpt_langchain.py +1076 -0
  7. gradio_runner.py +634 -46
  8. gradio_themes.py +3 -1
  9. h2oai_pipeline.py +54 -0
  10. loaders.py +50 -0
  11. prompter.py +370 -1
  12. requirements.txt +51 -3
  13. utils.py +477 -6
client_test.py CHANGED
@@ -36,84 +36,113 @@ Loaded as API: https://gpt.h2o.ai ✔
36
  {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.'}
37
 
38
  """
39
 
40
  debug = False
41
 
42
- import os
43
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
44
 
45
 
46
- def get_client():
47
  from gradio_client import Client
48
 
49
- client = Client(os.getenv('HOST', "http://localhost:7860"))
50
  if debug:
51
  print(client.view_api(all_endpoints=True))
52
  return client
53
 
54
55
  def test_client_basic():
56
- return run_client_basic(instruction_nochat='Who are you?', prompt_type='human_bot')
57
-
58
-
59
- def run_client_basic(instruction_nochat, prompt_type):
60
- instruction = '' # only for chat=True
61
- iinput = '' # only for chat=True
62
- context = ''
63
- # streaming output is supported, loops over and outputs each generation in streaming mode
64
- # but leave stream_output=False for simple input/output mode
65
- stream_output = False
66
- temperature = 0.1
67
- top_p = 0.75
68
- top_k = 40
69
- num_beams = 1
70
- max_new_tokens = 50
71
- min_new_tokens = 0
72
- early_stopping = False
73
- max_time = 20
74
- repetition_penalty = 1.0
75
- num_return_sequences = 1
76
- do_sample = True
77
- # only these 2 below used if pass chat=False
78
- chat = False
79
- iinput_nochat = ''
80
-
81
- args = [instruction,
82
- iinput,
83
- context,
84
- stream_output,
85
- prompt_type,
86
- temperature,
87
- top_p,
88
- top_k,
89
- num_beams,
90
- max_new_tokens,
91
- min_new_tokens,
92
- early_stopping,
93
- max_time,
94
- repetition_penalty,
95
- num_return_sequences,
96
- do_sample,
97
- chat,
98
- instruction_nochat,
99
- iinput_nochat,
100
- ]
101
  api_name = '/submit_nochat'
102
- client = get_client()
103
  res = client.predict(
104
  *tuple(args),
105
  api_name=api_name,
106
  )
107
- res_dict = dict(instruction_nochat=instruction_nochat, iinput_nochat=iinput_nochat, response=md_to_text(res))
 
108
  print(res_dict)
109
  return res_dict
110
 
111
 
112
- import markdown # pip install markdown
113
- from bs4 import BeautifulSoup # pip install beautifulsoup4
114
 
115
 
116
  def md_to_text(md):
 
117
  html = markdown.markdown(md)
118
  soup = BeautifulSoup(html, features='html.parser')
119
  return soup.get_text()
 
36
  {'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.'}
37
 
38
  """
39
+ import time
40
+ import os
41
+ import markdown # pip install markdown
42
+ from bs4 import BeautifulSoup # pip install beautifulsoup4
43
 
44
  debug = False
45
 
 
46
  os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
47
 
48
 
49
+ def get_client(serialize=True):
50
  from gradio_client import Client
51
 
52
+ client = Client(os.getenv('HOST', "http://localhost:7860"), serialize=serialize)
53
  if debug:
54
  print(client.view_api(all_endpoints=True))
55
  return client
56
 
57
 
58
+ def get_args(prompt, prompt_type, chat=False, stream_output=False, max_new_tokens=50):
59
+ from collections import OrderedDict
60
+ kwargs = OrderedDict(instruction=prompt if chat else '', # only for chat=True
61
+ iinput='', # only for chat=True
62
+ context='',
63
+ # streaming output is supported, loops over and outputs each generation in streaming mode
64
+ # but leave stream_output=False for simple input/output mode
65
+ stream_output=stream_output,
66
+ prompt_type=prompt_type,
67
+ temperature=0.1,
68
+ top_p=0.75,
69
+ top_k=40,
70
+ num_beams=1,
71
+ max_new_tokens=max_new_tokens,
72
+ min_new_tokens=0,
73
+ early_stopping=False,
74
+ max_time=20,
75
+ repetition_penalty=1.0,
76
+ num_return_sequences=1,
77
+ do_sample=True,
78
+ chat=chat,
79
+ instruction_nochat=prompt if not chat else '',
80
+ iinput_nochat='', # only for chat=False
81
+ langchain_mode='Disabled',
82
+ )
83
+ if chat:
84
+ # add chatbot output on end. Assumes serialize=False
85
+ kwargs.update(dict(chatbot=[['', None]]))
86
+
87
+ return kwargs, list(kwargs.values())
88
+
89
+
90
  def test_client_basic():
91
+ return run_client_nochat(prompt='Who are you?', prompt_type='human_bot', max_new_tokens=50)
92
+
93
+
94
+ def run_client_nochat(prompt, prompt_type, max_new_tokens):
95
+ kwargs, args = get_args(prompt, prompt_type, chat=False, max_new_tokens=max_new_tokens)
96
+
97
  api_name = '/submit_nochat'
98
+ client = get_client(serialize=True)
99
  res = client.predict(
100
  *tuple(args),
101
  api_name=api_name,
102
  )
103
+ res_dict = dict(prompt=kwargs['instruction_nochat'], iinput=kwargs['iinput_nochat'],
104
+ response=md_to_text(res))
105
  print(res_dict)
106
  return res_dict
107
 
108
 
109
+ def test_client_chat():
110
+ return run_client_chat(prompt='Who are you?', prompt_type='human_bot', stream_output=False, max_new_tokens=50)
111
+
112
+
113
+ def run_client_chat(prompt, prompt_type, stream_output, max_new_tokens):
114
+ kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output, max_new_tokens=max_new_tokens)
115
+
116
+ client = get_client(serialize=False)
117
+
118
+ res = client.predict(*tuple(args), api_name='/instruction')
119
+ args[-1] += [res[-1]]
120
+
121
+ res_dict = kwargs
122
+ res_dict['prompt'] = prompt
123
+ if not kwargs['stream_output']:
124
+ res = client.predict(*tuple(args), api_name='/instruction_bot')
125
+ res_dict['response'] = res[0][-1][1]
126
+ print(md_to_text(res_dict['response']))
127
+ return res_dict
128
+ else:
129
+ job = client.submit(*tuple(args), api_name='/instruction_bot')
130
+ res1 = ''
131
+ while not job.done():
132
+ outputs_list = job.communicator.job.outputs
133
+ if outputs_list:
134
+ res = job.communicator.job.outputs[-1]
135
+ res1 = res[0][-1][-1]
136
+ res1 = md_to_text(res1)
137
+ print(res1)
138
+ time.sleep(0.1)
139
+ print(job.outputs())
140
+ res_dict['response'] = res1
141
+ return res_dict
142
 
143
 
144
  def md_to_text(md):
145
+ assert md is not None, "Markdown is None"
146
  html = markdown.markdown(md)
147
  soup = BeautifulSoup(html, features='html.parser')
148
  return soup.get_text()
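Not part of the commit itself, but for context: a minimal usage sketch of the updated client helpers, assuming a h2oGPT server is already running at http://localhost:7860 (the default HOST) and the gradio_client package is installed.

    import os
    os.environ.setdefault('HOST', 'http://localhost:7860')  # where get_client() will connect (assumed local server)

    from client_test import run_client_nochat, run_client_chat

    # One-shot generation via the /submit_nochat endpoint
    res = run_client_nochat(prompt='Who are you?', prompt_type='human_bot', max_new_tokens=50)
    print(res['response'])

    # Chat-style generation via /instruction and /instruction_bot (non-streaming)
    res = run_client_chat(prompt='Who are you?', prompt_type='human_bot',
                          stream_output=False, max_new_tokens=50)
    print(res['response'])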
create_data.py ADDED
@@ -0,0 +1,1818 @@
1
+ """
2
+ Dataset creation tools.
3
+
4
+ Keep top-level imports clean of non-trivial imports for specific tools,
5
+ because this file is imported for various purposes
6
+ """
7
+
8
+ import ast
9
+ import concurrent.futures
10
+ import contextlib
11
+ import hashlib
12
+ import json
13
+ import os
14
+ import shutil
15
+ import signal
16
+ import sys
17
+ import traceback
18
+ from concurrent.futures import ProcessPoolExecutor
19
+
20
+ import psutil
21
+ import pytest
22
+ import pandas as pd
23
+ import numpy as np
24
+ from tqdm import tqdm
25
+
26
+ from utils import flatten_list
27
+
28
+
29
+ def parse_rst_file(filepath):
30
+ with open(filepath, 'r') as f:
31
+ input_data = f.read()
32
+ settings_overrides = {'initial_header_level': 2}
33
+ from docutils import core
34
+ document = core.publish_doctree(
35
+ source=input_data,
36
+ source_path=filepath,
37
+ settings_overrides=settings_overrides,
38
+ )
39
+ qa_pairs = []
40
+ current_section = None
41
+ current_question = ""
42
+ current_answer = ""
43
+ for node in document.traverse():
44
+ if node.__class__.__name__ == 'section':
45
+ current_section = ""
46
+ elif current_section is not None:
47
+ if node.__class__.__name__ == 'Text':
48
+ if node.astext()[-1] == "?":
49
+ if current_question:
50
+ qa_pairs.append((current_question, current_answer))
51
+ current_question = node.astext()
52
+ current_answer = ""
53
+ else:
54
+ current_answer += node.astext()
55
+ if current_answer:
56
+ qa_pairs.append((current_question, current_answer))
57
+ return {k: v for k, v in qa_pairs}
58
+
59
+
60
+ def test_scrape_dai_docs():
61
+ home = os.path.expanduser('~')
62
+ file = os.path.join(home, 'h2oai/docs/faq.rst')
63
+ qa_pairs = parse_rst_file(file)
64
+ prompt_type = 'human_bot'
65
+ from prompter import prompt_types
66
+ assert prompt_type in prompt_types
67
+ save_thing = [{"instruction": k, "output": v, 'prompt_type': prompt_type} for k, v in qa_pairs.items()]
68
+ output_file = "dai_faq.json"
69
+ with open(output_file, "wt") as f:
70
+ f.write(json.dumps(save_thing, indent=2))
71
+
72
+
73
+ def test_scrape_dai_docs_all():
74
+ """
75
+ pytest create_data.py::test_scrape_dai_docs_all
76
+ """
77
+ import glob
78
+ import nltk
79
+ nltk.download('punkt')
80
+ dd = {}
81
+ np.random.seed(1234)
82
+ home = os.path.expanduser('~')
83
+ files = list(glob.glob(os.path.join(home, "h2oai/docs/**/*rst")))
84
+ np.random.shuffle(files)
85
+ val_count = int(0.05 * len(files))
86
+ train_files = files[val_count:]
87
+ valid_files = files[:val_count]
88
+ things = [
89
+ ("dai_docs.train.json", train_files),
90
+ ("dai_docs.valid.json", valid_files)
91
+ ]
92
+ for LEN in [100, 200, 500]:
93
+ for output_file, ff in things:
94
+ if output_file not in dd:
95
+ dd[output_file] = []
96
+ for f in ff:
97
+ with open(f) as input:
98
+ blob = input.read()
99
+ blob = blob.replace("~~", "")
100
+ blob = blob.replace("==", "")
101
+ blob = blob.replace("''", "")
102
+ blob = blob.replace("--", "")
103
+ blob = blob.replace("**", "")
104
+ dd[output_file].extend(get_sentences(blob, length=LEN))
105
+ for output_file, _ in things:
106
+ save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in dd[output_file]]
107
+ with open(output_file, "wt") as f:
108
+ f.write(json.dumps(save_thing, indent=2))
109
+
110
+
111
+ def get_sentences(blob, length):
112
+ """
113
+ Break up input text into sentences and return a list of chunks of about length characters each
114
+ :param blob:
115
+ :param length:
116
+ :return:
117
+ """
118
+ import nltk
119
+ nltk.download('punkt')
120
+ from nltk.tokenize import sent_tokenize
121
+ sentences = sent_tokenize(blob)
122
+ my_sentences = []
123
+ my_string = ""
124
+ for sentence in sentences:
125
+ if len(my_string) + len(sentence) <= length:
126
+ if my_string:
127
+ my_string += " " + sentence
128
+ else:
129
+ my_string = sentence
130
+ else:
131
+ my_sentences.append(my_string)
132
+ my_string = ""
133
+ return my_sentences or [my_string]
134
+
135
+
136
+ def setup_dai_docs(path=None, dst="working_dir_docs", from_hf=False):
137
+ """
138
+ Only supported if have access to source code or HF token for HF spaces and from_hf=True
139
+ :param path:
140
+ :param dst:
141
+ :param from_hf:
142
+ :return:
143
+ """
144
+
145
+ home = os.path.expanduser('~')
146
+
147
+ if from_hf:
148
+ # assumes
149
+ from huggingface_hub import hf_hub_download
150
+ # True for case when locally already logged in with correct token, so don't have to set key
151
+ token = os.getenv('HUGGINGFACE_API_TOKEN', True)
152
+ path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.zip', token=token, repo_type='dataset')
153
+ path = 'h2oai'
154
+ import zipfile
155
+ with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
156
+ zip_ref.extractall(path)
157
+ path = os.path.join(path, 'docs/**/*')
158
+
159
+ if path is None:
160
+ if os.path.isdir(os.path.join(home, 'h2oai')):
161
+ path = os.path.join(home, "h2oai/docs/**/*")
162
+ else:
163
+ assert os.path.isdir(os.path.join(home, 'h2oai.superclean')), '%s does not exist' % path
164
+ path = os.path.join(home, "h2oai.superclean/docs/**/*")
165
+ import glob
166
+ files = list(glob.glob(path, recursive=True))
167
+
168
+ # pandoc can't find include files
169
+
170
+ remove(dst)
171
+ os.makedirs(dst)
172
+
173
+ # copy full tree, for absolute paths in rst
174
+ for fil in files:
175
+ if os.path.isfile(fil):
176
+ shutil.copy(fil, dst)
177
+
178
+ # hack for relative path
179
+ scorers_dir = os.path.join(dst, 'scorers')
180
+ makedirs(scorers_dir)
181
+ for fil in glob.glob(os.path.join(dst, '*.frag')):
182
+ shutil.copy(fil, scorers_dir)
183
+
184
+ return dst
185
+
186
+
187
+ def rst_to_outputs(files, min_len=30, max_len=2048//2 - 30):
188
+ # account for sequence length (context window) including prompt and input and output
189
+
190
+ # os.system('pandoc -f rst -t plain ./expert_settings/nlp_settings.rst')
191
+ import pypandoc
192
+ basedir = os.path.abspath(os.getcwd())
193
+
194
+ outputs = []
195
+ for fil in files:
196
+ os.chdir(basedir)
197
+ os.chdir(os.path.dirname(fil))
198
+ fil = os.path.basename(fil)
199
+ print("Processing %s" % fil, flush=True)
200
+ # out_format can be one of: asciidoc, asciidoctor, beamer, biblatex, bibtex, commonmark, commonmark_x,
201
+ # context, csljson, docbook, docbook4, docbook5, docx, dokuwiki,
202
+ # dzslides, epub, epub2, epub3, fb2, gfm, haddock, html, html4, html5, icml,
203
+ # ipynb, jats, jats_archiving, jats_articleauthoring, jats_publishing, jira,
204
+ # json, latex, man,
205
+ # markdown, markdown_github, markdown_mmd, markdown_phpextra, markdown_strict,
206
+ # mediawiki, ms, muse, native, odt, opendocument, opml, org, pdf, plain, pptx,
207
+ # revealjs, rst, rtf, s5, slideous, slidy, tei, texinfo, textile, xwiki, zimwiki
208
+ out_format = 'plain'
209
+ # avoid extra new lines injected into text
210
+ extra_args = ['--wrap=preserve', '--resource path="%s" % dst']
211
+
212
+ plain_list = []
213
+ try:
214
+ # valid for expert settings
215
+ input_rst = pypandoc.convert_file(fil, 'rst')
216
+ input_list = input_rst.split('\n``')
217
+ for input_subrst in input_list:
218
+ input_plain = pypandoc.convert_text(input_subrst, format='rst', to='plain')
219
+ plain_list.append([input_plain, fil])
220
+ except Exception as e:
221
+ print("file exception: %s %s" % (fil, str(e)), flush=True)
222
+
223
+ if not plain_list:
224
+ # if failed to process as pieces of rst, then
225
+ output = pypandoc.convert_file(fil, out_format, extra_args=extra_args, format='rst')
226
+ outputs1 = get_sentences(output, length=max_len)
227
+ for oi, output in enumerate(outputs1):
228
+ output = output.replace('\n\n', '\n')
229
+ plain_list.append([output, fil])
230
+ outputs.extend(plain_list)
231
+
232
+ # report:
233
+ # [print(len(x)) for x in outputs]
234
+
235
+ # deal with blocks longer than context size (sequence length) of 2048
236
+ new_outputs = []
237
+ num_truncated = 0
238
+ num_orig = len(outputs)
239
+ for output, fil in outputs:
240
+ if len(output) < max_len:
241
+ new_outputs.append([output, fil])
242
+ continue
243
+ outputs1 = get_sentences(output, length=max_len)
244
+ for oi, output1 in enumerate(outputs1):
245
+ output1 = output1.replace('\n\n', '\n')
246
+ new_outputs.append([output1, fil])
247
+ num_truncated += 1
248
+ print('num_orig: %s num_truncated: %s' % (num_orig, num_truncated), flush=True)
249
+
250
+ new_outputs = [[k.strip(), fil] for k, fil in new_outputs if len(k.strip()) > min_len]
251
+
252
+ return new_outputs
253
+
254
+
255
+ def test_scrape_dai_docs_all_pandoc():
256
+ """
257
+ pytest -s -v create_data.py::test_scrape_dai_docs_all_pandoc
258
+ :return:
259
+ """
260
+
261
+ dst = setup_dai_docs()
262
+
263
+ import glob
264
+ files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True))
265
+
266
+ basedir = os.path.abspath(os.getcwd())
267
+ new_outputs = rst_to_outputs(files)
268
+ os.chdir(basedir)
269
+
270
+ remove(dst)
271
+ save_thing = [{"output": k.strip(), 'prompt_type': 'plain'} for k in new_outputs]
272
+ output_file = "dai_docs.train_cleaned.json"
273
+ with open(output_file, "wt") as f:
274
+ f.write(json.dumps(save_thing, indent=2))
275
+
276
+
277
+ def remove(path: str):
278
+ try:
279
+ if path is not None and os.path.exists(path):
280
+ if os.path.isdir(path):
281
+ shutil_rmtree(path, ignore_errors=True)
282
+ else:
283
+ with contextlib.suppress(FileNotFoundError):
284
+ os.remove(path)
285
+ except:
286
+ pass
287
+
288
+
289
+ def shutil_rmtree(*args, **kwargs):
290
+ return shutil.rmtree(*args, **kwargs)
291
+
292
+
293
+ def test_config_to_json():
294
+ """
295
+ Needs to run from Driverless AI source directory.
296
+ E.g. (base) jon@gpu:~/h2oai$ pytest -s -v /data/jon/h2ogpt/create_data.py::test_config_to_json ; cp config.json /data/jon/h2ogpt/
297
+ :return:
298
+ """
299
+ try:
300
+ # Arrange
301
+ import json
302
+ from h2oaicore.systemutils import config
303
+ toml_list = []
304
+ for k, v in config.get_meta_dict().items():
305
+ title = (v.title + ": ") if v.title else ''
306
+ comment = v.comment or ''
307
+ if not (title or comment):
308
+ continue
309
+ toml_list.extend(
310
+ [
311
+ {
312
+ 'prompt_type': 'plain',
313
+ 'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
314
+ },
315
+ {
316
+ 'prompt_type': 'plain',
317
+ 'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
318
+ },
319
+ {
320
+ 'prompt_type': 'plain',
321
+ 'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace("\n", ""),
322
+ } if title and comment else None,
323
+ {
324
+ 'prompt_type': 'human_bot',
325
+ 'instruction': f'Explain the following expert setting for Driverless AI',
326
+ 'input': f"{k}",
327
+ 'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
328
+ },
329
+ {
330
+ 'prompt_type': 'human_bot',
331
+ 'instruction': f'Explain the following expert setting for Driverless AI',
332
+ 'input': f"{k}",
333
+ 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
334
+ },
335
+ {
336
+ 'prompt_type': 'human_bot',
337
+ 'instruction': f'Explain the following expert setting for Driverless AI',
338
+ 'input': f"{k.replace('_', ' ')}",
339
+ 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
340
+ },
341
+ {
342
+ 'prompt_type': 'human_bot',
343
+ 'instruction': f'Explain the following expert setting for Driverless AI',
344
+ 'input': f"{title}",
345
+ 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
346
+ },
347
+ {
348
+ 'prompt_type': 'human_bot',
349
+ 'instruction': f'Provide a short explanation of the expert setting {k}',
350
+ 'output': f"{k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
351
+ },
352
+ {
353
+ 'prompt_type': 'human_bot',
354
+ 'instruction': f'Provide a detailed explanation of the expert setting {k}',
355
+ 'output': f"{k.replace('_', ' ')} config.toml: {title}{comment}".replace("\n", ""),
356
+ },
357
+ ]
358
+ )
359
+ toml_list = [x for x in toml_list if x]
360
+ with open("config.json", "wt") as f:
361
+ f.write(json.dumps(toml_list, indent=2))
362
+ except Exception as e:
363
+ print("Exception: %s" % str(e), flush=True)
364
+
365
+
366
+ def copy_tree(src, dst, follow_symlink=False):
367
+ makedirs(dst, exist_ok=True)
368
+ for (path, dirs, files) in os.walk(src, followlinks=follow_symlink):
369
+ new_path = path.replace(src, dst)
370
+ makedirs(new_path, exist_ok=True)
371
+ for file in files:
372
+ filename = os.path.join(path, file)
373
+ new_filename = os.path.join(new_path, file)
374
+ # print("%s -> %s" % (filename, new_filename))
375
+ try:
376
+ atomic_copy(filename, new_filename)
377
+ except FileNotFoundError:
378
+ pass
379
+
380
+
381
+ def atomic_move(src, dst):
382
+ try:
383
+ shutil.move(src, dst)
384
+ except (shutil.Error, FileExistsError):
385
+ pass
386
+ remove(src)
387
+
388
+
389
+ def atomic_copy(src=None, dst=None, with_permissions=True):
390
+ if os.path.isfile(dst):
391
+ return
392
+ import uuid
393
+ my_uuid = uuid.uuid4()
394
+ dst_tmp = dst + str(my_uuid)
395
+ makedirs(os.path.dirname(dst), exist_ok=True)
396
+ if with_permissions:
397
+ shutil.copy(src, dst_tmp)
398
+ else:
399
+ shutil.copyfile(src, dst_tmp)
400
+ atomic_move(dst_tmp, dst)
401
+ remove(dst_tmp)
402
+
403
+
404
+ def makedirs(path, exist_ok=True):
405
+ """
406
+ Avoid some inefficiency in os.makedirs()
407
+ :param path:
408
+ :param exist_ok:
409
+ :return:
410
+ """
411
+ if os.path.isdir(path) and os.path.exists(path):
412
+ assert exist_ok, "Path already exists"
413
+ return path
414
+ os.makedirs(path, exist_ok=exist_ok)
415
+
416
+
417
+ ## Download from https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_unfiltered_cleaned_split.json
418
+ ## Turn into simple instruct prompt type. No context/previous conversations.
419
+ def test_prep_instruct_vicuna():
420
+ from datasets import load_dataset
421
+ filename = 'ShareGPT_unfiltered_cleaned_split.json'
422
+ if not os.path.exists(filename):
423
+ os.system('wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % filename)
424
+ data = load_dataset("json", data_files={"train": filename})["train"]
425
+ training_rows = []
426
+ for i in range(data.num_rows):
427
+ conversations = data[i]['conversations']
428
+ assert isinstance(conversations, list), conversations
429
+ convo = ""
430
+ for j, conv in enumerate(conversations):
431
+ # Get ready for generate.py prompt_type=human_bot
432
+ # But train with prompt_type=plain
433
+ if conv['from'] == 'human':
434
+ FROM = '<human>: '
435
+ elif conv['from'] == 'gpt':
436
+ FROM = '<bot>: '
437
+ convo += f"{FROM}" + conv['value'] + "\n"
438
+ if convo:
439
+ training_rows.append(dict(input=convo))
440
+ with open(filename + ".generate_human_bot.train_plain.json", "wt") as f:
441
+ f.write(json.dumps(training_rows, indent=2))
442
+
443
+ POSTFIX = ".generate_human_bot.train_plain.json"
444
+
445
+ # https://bair.berkeley.edu/blog/2023/04/03/koala/
446
+ OIG_DATASETS = [
447
+ "unified_chip2.jsonl",
448
+ "unified_grade_school_math_instructions.jsonl",
449
+ "unified_poetry_2_song.jsonl",
450
+ "unified_plot_screenplay_books_dialog.jsonl",
451
+ ]
452
+
453
+ # hub issue: https://huggingface.co/datasets/laion/OIG/discussions/4
454
+ ALL_OIG_DATASETS = ['unified_abstract_infill.jsonl',
455
+ 'unified_basic.jsonl',
456
+ 'unified_canadian_parliament.jsonl',
457
+ 'unified_chip2.jsonl',
458
+ 'unified_conv_finqa.jsonl',
459
+ 'unified_cuad.jsonl',
460
+ 'unified_essays.jsonl',
461
+ 'unified_flan.jsonl.gz',
462
+ 'unified_grade_school_math_instructions.jsonl',
463
+ 'unified_hc3_human.jsonl',
464
+ 'unified_image_prompts_instructions.jsonl',
465
+ 'unified_joke_explanations.jsonl',
466
+ 'unified_mathqa_flanv2_kojma_cot.jsonl',
467
+ 'unified_merged_code_xp3.jsonl',
468
+ 'unified_multi_news.jsonl',
469
+ 'unified_multi_sum.jsonl',
470
+ 'unified_ni.jsonl.gz',
471
+ 'unified_nq.jsonl',
472
+ 'unified_openai_summarize_tldr.jsonl',
473
+ 'unified_oscar_en_sample_dialog.jsonl',
474
+ 'unified_p3.jsonl.gz',
475
+ 'unified_plot_screenplay_books_dialog.jsonl',
476
+ 'unified_poetry_2_song.jsonl',
477
+ 'unified_poetry_instructions.jsonl',
478
+ 'unified_rallio_safety_and_prosocial.jsonl',
479
+ 'unified_rallio_soda_upgraded_2048.jsonl',
480
+ 'unified_soda_dialog.jsonl',
481
+ 'unified_sqlv1.jsonl',
482
+ 'unified_sqlv2.jsonl',
483
+ 'unified_squad_v2.jsonl',
484
+ 'unified_squad_v2_more_neg.jsonl',
485
+ 'unified_ul2_plus_oscar_en_sample_dialog.jsonl',
486
+ 'unified_unifiedskg_instructions.jsonl',
487
+ 'unified_unnatural_instructions.jsonl',
488
+ 'unified_xp3_sample.jsonl']
489
+
490
+ useful_oig_files = ['unified_rallio_safety_and_prosocial.jsonl.parquet',
491
+ 'unified_chip2.jsonl.parquet',
492
+ 'unified_cuad.jsonl.parquet',
493
+ 'unified_essays.jsonl.parquet',
494
+ 'unified_flan.jsonl.gz.parquet',
495
+ 'unified_grade_school_math_instructions.jsonl.parquet',
496
+ 'unified_hc3_human.jsonl.parquet',
497
+ 'unified_mathqa_flanv2_kojma_cot.jsonl.parquet',
498
+ 'unified_merged_code_xp3.jsonl.parquet',
499
+ 'unified_multi_news.jsonl.parquet',
500
+ #'unified_multi_sum.jsonl.parquet'
501
+ 'unified_ni.jsonl.gz.parquet',
502
+ 'unified_openai_summarize_tldr.jsonl.parquet',
503
+ #'unified_oscar_en_sample_dialog.jsonl.parquet', # create text containing these N words, not specific
504
+ 'unified_plot_screenplay_books_dialog.jsonl.parquet',
505
+ 'unified_soda_dialog.jsonl.parquet',
506
+ 'unified_unnatural_instructions.jsonl.parquet',
507
+ ]
508
+
509
+
510
+ @pytest.mark.parametrize("filename", OIG_DATASETS)
511
+ def test_get_small_sample_oig_data(filename):
512
+ if not os.path.exists(filename):
513
+ os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
514
+ import json
515
+ rows = []
516
+ with open(filename, "r") as f:
517
+ for line in f.readlines():
518
+ row = json.loads(line)
519
+ rows.append(dict(input=row["text"]))
520
+ with open(filename + POSTFIX, "w") as f:
521
+ f.write(json.dumps(rows, indent=2))
522
+
523
+
524
+ @pytest.mark.parametrize("filename", ALL_OIG_DATASETS)
525
+ def test_download_useful_data_as_parquet(filename):
526
+ dest_file = filename + '.parquet'
527
+ if dest_file not in useful_oig_files:
528
+ pytest.skip('file declared not useful')
529
+ if not os.path.exists(filename):
530
+ os.system('wget https://huggingface.co/datasets/laion/OIG/resolve/main/%s' % filename)
531
+ if not os.path.exists(dest_file):
532
+ df = pd.read_json(path_or_buf=filename, lines=True)
533
+ df.to_parquet(dest_file, index=False)
534
+
535
+
536
+ def test_merge_shuffle_small_sample_oig_data():
537
+ np.random.seed(1234)
538
+ rows = []
539
+ for filename in OIG_DATASETS:
540
+ with open(filename + POSTFIX, "r") as f:
541
+ rows.extend(json.loads(f.read()))
542
+ np.random.shuffle(rows)
543
+ with open("merged_shuffled_OIG_%s.json" % hashlib.sha256(str(OIG_DATASETS).encode()).hexdigest()[:10], "w") as f:
544
+ f.write(json.dumps(rows, indent=2))
545
+
546
+
547
+ def test_join_jsons():
548
+ files = ['config.json'] * 1 + \
549
+ ['dai_docs.train_cleaned.json'] * 2 + \
550
+ ['dai_faq.json'] * 3
551
+ print(files)
552
+ lst = []
553
+ [lst.extend(json.load(open(fil, 'rt'))) for fil in files]
554
+ print(len(lst))
555
+ json.dump(lst, open("merged.json", "wt"), indent=2)
556
+
557
+
558
+ @pytest.mark.parametrize("filename", ['Anthropic/hh-rlhf'])
559
+ def test_make_rlhf_good_data(filename):
560
+ from datasets import load_dataset
561
+ rows = load_dataset(filename)["train"]["chosen"]
562
+ new_rows = []
563
+ for row in rows:
564
+ if row[:2] == "\n\n":
565
+ row = row[2:]
566
+ row = row.replace("Human: ", "<human>: ")
567
+ row = row.replace("Assistant: ", "<bot>: ")
568
+ new_rows.append(dict(input=row))
569
+ with open(filename.replace("/", "_") + POSTFIX, "w") as f:
570
+ f.write(json.dumps(new_rows, indent=2))
571
+
572
+
573
+
574
+ def test_show_prompts():
575
+ files = ['config.json'] * 1 + \
576
+ ['dai_docs.train_cleaned.json'] * 1 + \
577
+ ['dai_faq.json'] * 1
578
+ file_points = [json.load(open(fil, 'rt')) for fil in files]
579
+ from prompter import generate_prompt
580
+ for data_points in file_points:
581
+ for data_point in data_points:
582
+ print(generate_prompt(data_point, 'plain', False, False)[0])
583
+
584
+
585
+ def test_get_open_datasets():
586
+ # HF changed things so we no longer get a raw list of all datasets; we don't have to filter, but we can't do a negative filter
587
+ open_tags = ['license:Apache License 2.0',
588
+ 'license:mit',
589
+ 'license:apache',
590
+ 'license:apache2',
591
+ 'license:apache-2.0',
592
+ 'license:bsd',
593
+ 'license:bsd-2-clause',
594
+ 'license:bsd-3-clause',
595
+ 'license:bsd-3-clause-clear',
596
+ 'license:lgpl-2.1',
597
+ 'license:lgpl-3.0',
598
+ 'license:lgpl-lr',
599
+ 'license:lgpl',
600
+ 'license:openrail++',
601
+ 'license:openrail',
602
+ 'license:bigscience-bloom-rail-1.0',
603
+ #'license:agpl-3.0',
604
+ 'license:other',
605
+ 'license:unknown',
606
+ # 'license:mpl-2.0', # ok, but would have to include original copyright, license, source, copies in distribution
607
+ # Attribution required:
608
+ 'license:odc-by',
609
+ 'license:cc-by-4.0',
610
+ 'license:cc-by-3.0',
611
+ 'license:cc-by-2.0',
612
+ 'license:cc-by-2.5',
613
+ #'license:cc-by-sa-4.0', # would require same license
614
+ 'license:odbl',
615
+ 'license:pddl',
616
+ 'license:ms-pl',
617
+ 'license:zlib',
618
+ ]
619
+ # bad license: cc-by-nc-4.0
620
+
621
+ from huggingface_hub import list_datasets
622
+ datasets = flatten_list([[x for x in list_datasets(filter=y)] for y in open_tags])
623
+ datasets += [x for x in list_datasets(author='openai')]
624
+ # check all:
625
+ all_license_tags = set(flatten_list([[y for y in x.tags if 'license' in y] for x in datasets]))
626
+ print(len(all_license_tags))
627
+ open_datasets = [x for x in datasets if any([y in x.tags for y in open_tags]) or 'license:' not in str(x.tags)]
628
+ print('open_datasets', len(open_datasets))
629
+ all_task_tags = set(flatten_list([[y for y in x.tags if 'task' in y] for x in open_datasets]))
630
+ print('all_task_tags', len(all_task_tags))
631
+ excluded_tags = ['image', 'hate', 'tabular', 'table-', 'classification', 'retrieval',
632
+ 'translation', 'identification', 'object', 'mask', 'to-text',
633
+ 'face-detection', 'audio', 'voice', 'reinforcement', 'depth-est',
634
+ 'forecasting', 'parsing', 'visual', 'speech', 'multiple-choice',
635
+ 'slot-filling', 'irds/argsme', '-scoring', 'other', 'graph-ml',
636
+ 'feature-extraction', 'keyword-spotting',
637
+ 'coreference-resolution', 'segmentation',
638
+ 'word-sense-disambiguation',
639
+ 'lemmatization']
640
+ task_tags = [x.replace('task_categories:', '').replace('task_ids:', '')
641
+ for x in all_task_tags if not any([y in x for y in
642
+ excluded_tags])]
643
+ print('task_tags', len(task_tags))
644
+ # str(x.tags) to catch any pattern match to anything in list
645
+ open_tasked_datasets = [x for x in open_datasets if
646
+ any([y in str([x for x in x.tags if 'task' in x]) for y in task_tags]) and
647
+ not any([y in str([x for x in x.tags if 'task' in x]) for y in excluded_tags]) or
648
+ 'task_categories' not in str(x.tags) and 'task_ids' not in str(x.tags)]
649
+ open_tasked_datasets = [x for x in open_tasked_datasets if not x.disabled]
650
+ open_tasked_datasets = [x for x in open_tasked_datasets if not x.gated]
651
+ open_tasked_datasets = [x for x in open_tasked_datasets if not x.private]
652
+ print('open_tasked_datasets', len(open_tasked_datasets))
653
+ sizes = list(set(flatten_list([[(y, x.id) for y in x.tags if 'size' in y] for x in open_tasked_datasets])))
654
+ languages = list(set(flatten_list([[(y, x.id) for y in x.tags if 'language:' in y] for x in open_tasked_datasets])))
655
+ open_english_tasked_datasets = [x for x in open_tasked_datasets if
656
+ 'language:' not in str(x.tags) or
657
+ 'language:en' in str(x.tags)]
658
+ small_open_english_tasked_datasets = [x for x in open_english_tasked_datasets if
659
+ 'n<1K' in str(x.tags) or
660
+ '1K<n<10K' in str(x.tags) or
661
+ '1K0<n<100K' in str(x.tags) or
662
+ '100K<n<1M' in str(x.tags) or
663
+ 'size_category' not in str(x.tags)
664
+ ]
665
+ # 'aeslc' : email_body, subject -> summarization?
666
+ # load_dataset(open_tasked_datasets[0].id).data['train'].to_pandas()
667
+ ids = [x.id for x in small_open_english_tasked_datasets]
668
+
669
+ # sanity checks
670
+ # https://bair.berkeley.edu/blog/2023/04/03/koala/
671
+ assert 'alespalla/chatbot_instruction_prompts' in ids
672
+ assert 'laion/OIG' in ids
673
+ assert 'openai/webgpt_comparisons' in ids
674
+ assert 'openai/summarize_from_feedback' in ids
675
+ assert 'Anthropic/hh-rlhf' in ids
676
+
677
+ # useful but not allowed for commercial purposes:
678
+ # https://huggingface.co/datasets/squad
679
+
680
+ print('open_english_tasked_datasets: ', ids, flush=True)
681
+
682
+ exclude_ids = ['allenai/nllb', # translation only
683
+ 'hf-internal-testing/fixtures_image_utils', # testing
684
+ 'allenai/c4', # search-url
685
+ 'agemagician/uniref50', # unknown
686
+ 'huggingface-course/documentation-images', # images
687
+ 'smilegate-ai/kor_unsmile', # korean
688
+ 'MohamedRashad/ChatGPT-prompts', # ChatGPT/LearnGPT/https://www.emergentmind.com/
689
+ 'humarin/chatgpt-paraphrases', # Paraphrase using ChatGPT
690
+ 'Jeska/vaccinchat', # not useful
691
+ 'alespalla/chatbot_instruction_prompts', # mixes alpaca
692
+ 'allenai/prosocial-dialog', # already excluded, but wrongly in other datasets that claim a more permissive license
693
+ 'AlekseyKorshuk/persona-chat', # low quality
694
+ 'bavard/personachat_truecased', # low quality
695
+ 'adamlin/daily_dialog', # medium quality conversations
696
+ 'adamlin/FewShotWoz', # low quality
697
+ 'benjaminbeilharz/better_daily_dialog', # low quality
698
+ 'benjaminbeilharz/daily_dialog_w_turn_templates', # low
699
+ 'benjaminbeilharz/empathetic_dialogues_for_lm', # low
700
+ 'GEM-submissions/GEM__bart_base_schema_guided_dialog__1645547915', # NA
701
+ 'ia-bentebib/conv_ai_2_fr', # low fr
702
+ 'ia-bentebib/daily_dialog_fr', # low fr
703
+ 'ia-bentebib/dialog_re_fr', # low fr
704
+ 'ia-bentebib/empathetic_dialogues_fr', # low fr
705
+ 'roskoN/dailydialog', # low
706
+ 'VadorMazer/skyrimdialogstest', # low
707
+ 'bigbio/med_qa', # med specific Q/A
708
+ 'biu-nlp/qa_srl2018', # low quality Q/A
709
+ 'biu-nlp/qa_discourse', # low quality Q/A
710
+ 'iarfmoose/qa_evaluator', # low quality Q/A
711
+ 'jeopardy', # low quality Q/A -- no reasoning
712
+ 'narrativeqa', # low quality Q/A
713
+ 'nomic-ai/gpt4all_prompt_generations', # bad license
714
+ 'nomic-ai/gpt4all_prompt_generations_with_p3', # bad license
715
+ 'HuggingFaceH4/alpaca', # bad license
716
+ 'tatsu-lab/alpaca', # ToS breaking
717
+ 'yahma/alpaca-cleaned', # ToS breaking
718
+ 'Hello-SimpleAI/HC3', # bad license
719
+ 'glue', # no reasoning QA
720
+ 'sahil2801/CodeAlpaca-20k', # bad license
721
+ 'Short-Answer-Feedback/saf_communication_networks_english', # long Q, medium A
722
+ ]
723
+ small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if x.id not in exclude_ids]
724
+ # some ids clearly speech related
725
+ small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'speech' not in x.id]
726
+ # HF testing
727
+ small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if 'hf-internal-testing' not in x.id]
728
+ small_open_english_tasked_datasets = [x for x in small_open_english_tasked_datasets if
729
+ 'chinese' not in x.id]
730
+
731
+ sorted_small_open_english_tasked_datasets = sorted([(x.downloads, x) for x in small_open_english_tasked_datasets],
732
+ key=lambda x: x[0], reverse=True)
733
+
734
+ # NOTES:
735
+ # Run like pytest -s -v create_data.py::test_get_open_datasets &> getdata9.log
736
+ # See what needs config passed and add:
737
+ # grep 'load_dataset(' getdata9.log|grep -v data_id|less -S
738
+ # grep "pip install" getdata9.log
739
+ # NOTE: Some datasets have a default config, but others exist as well; unclear how to access them.
740
+
741
+
742
+ """
743
+ https://huggingface.co/datasets/wikihow/blob/main/wikihow.py
744
+ https://github.com/mahnazkoupaee/WikiHow-Dataset
745
+ https://ucsb.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
746
+ https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358
747
+ """
748
+
749
+ """
750
+ # some ambiguous or non-commercial datasets
751
+ https://github.com/PhoebusSi/alpaca-CoT
752
+ """
753
+
754
+ timeout = 3 * 60
755
+ # laion/OIG takes longer
756
+ for num_downloads, dataset in sorted_small_open_english_tasked_datasets:
757
+ data_id = dataset.id
758
+ func = do_one
759
+ args = (data_id, num_downloads)
760
+ kwargs = {}
761
+ with ProcessPoolExecutor(max_workers=1) as executor:
762
+ future = executor.submit(func, *args, **kwargs)
763
+ try:
764
+ future.result(timeout=timeout)
765
+ except concurrent.futures.TimeoutError:
766
+ print("\n\ndata_id %s timeout\n\n" % data_id, flush=True)
767
+ for child in psutil.Process(os.getpid()).children(recursive=True):
768
+ os.kill(child.pid, signal.SIGINT)
769
+ os.kill(child.pid, signal.SIGTERM)
770
+ os.kill(child.pid, signal.SIGKILL)
771
+
772
+
773
+ def do_one(data_id, num_downloads):
774
+ from datasets import load_dataset
775
+ out_file = "data_%s.parquet" % str(data_id.replace('/', '_'))
776
+ if os.path.isfile(out_file) and os.path.getsize(out_file) > 1024**3:
777
+ return
778
+ try:
779
+ print("Loading data_id %s num_downloads: %s" % (data_id, num_downloads), flush=True)
780
+ avail_list = None
781
+ try:
782
+ data = load_dataset(data_id, 'foobar')
783
+ except Exception as e:
784
+ if 'Available: ' in str(e):
785
+ avail_list = ast.literal_eval(str(e).split('Available:')[1].strip())
786
+ else:
787
+ avail_list = None
788
+ if avail_list is None:
789
+ avail_list = [None]
790
+ print("%s avail_list: %s" % (data_id, avail_list), flush=True)
791
+
792
+ for name in avail_list:
793
+ out_file = "data_%s_%s.parquet" % (str(data_id.replace('/', '_')), str(name))
794
+ if os.path.isfile(out_file):
795
+ continue
796
+ data = load_dataset(data_id, name)
797
+ column_names_dict = data.column_names
798
+ column_names = column_names_dict[list(column_names_dict.keys())[0]]
799
+ print("Processing data_id %s num_downloads: %s columns: %s" % (data_id, num_downloads, column_names),
800
+ flush=True)
801
+ data_dict = data.data
802
+ col_dict = data.num_columns
803
+ first_col = list(col_dict.keys())[0]
804
+ if 'train' in data_dict:
805
+ df = data['train'].to_pandas()
806
+ else:
807
+ df = data[first_col].to_pandas()
808
+ # csv has issues with escaping chars, even for datasets I know I want
809
+ df.to_parquet(out_file, index=False)
810
+ except Exception as e:
811
+ t, v, tb = sys.exc_info()
812
+ ex = ''.join(traceback.format_exception(t, v, tb))
813
+ print("Exception: %s %s" % (data_id, ex), flush=True)
814
+
815
+
816
+ def test_otherlic():
817
+ from huggingface_hub import list_datasets
818
+ lic = ['license:odc-by',
819
+ 'license:cc-by-4.0',
820
+ 'license:cc-by-3.0',
821
+ 'license:cc-by-2.0',
822
+ 'license:cc-by-2.5',
823
+ 'license:cc-by-sa-4.0',
824
+ 'license:odbl',
825
+ 'license:pddl',
826
+ 'license:ms-pl',
827
+ 'license:zlib',
828
+ ]
829
+ datasets = flatten_list([[x for x in list_datasets(filter=y) if 'translation' not in str(x.tags)] for y in lic])
830
+ print(len(datasets))
831
+
832
+
833
+ # These useful datasets are determined based upon data sample, column types, and uniqueness compared to larger datasets like Pile
834
+ # grep columns getdata13.log|grep -v "\['image'\]"|sort|uniq|grep -v tokens|grep -v "'image'"|grep -v embedding|grep dialog
835
+ useful = ['Dahoas/instruct-human-assistant-prompt',
836
+ 'Dahoas/first-instruct-human-assistant-prompt',
837
+ 'knkarthick/dialogsum', # summary of conversation
838
+ 'McGill-NLP/FaithDial', # medium quality
839
+ 'Zaid/quac_expanded', # medium quality context + QA
840
+ '0-hero/OIG-small-chip2', # medium
841
+ 'alistvt/coqa-flat', # QA medium
842
+ 'AnonymousSub/MedQuAD_47441_Question_Answer_Pairs', # QA medium
843
+ 'Anthropic/hh-rlhf', # high quality # similar to Dahoas/full-hh-rlhf
844
+ 'arjunth2001/online_privacy_qna', # good quality QA
845
+ 'Dahoas/instruct_helpful_preferences', # medium quality instruct
846
+ 'Dahoas/rl-prompt-dataset', # medium chat
847
+ 'Dahoas/rm-static', # medium chat
848
+ 'Dahoas/static-hh', # medium chat # HuggingFaceH4/self_instruct
849
+ 'Dahoas/synthetic-instruct-gptj-pairwise', # medium chat
850
+ 'eli5', # QA if prompt ELI5
851
+ 'gsm8k', # QA (various)
852
+ 'guanaco/guanaco', # prompt/response
853
+ 'kastan/rlhf-qa-comparisons', # good QA
854
+ 'kastan/rlhf-qa-conditional-generation-v2', # prompt answer
855
+ 'OllieStanley/humaneval-mbpp-codegen-qa', # code QA, but started from words, so better than other code QA
856
+ 'OllieStanley/humaneval-mbpp-testgen-qa', # code QA
857
+ 'Graverman/Instruct-to-Code', # code QA
858
+ 'openai/summarize_from_feedback', # summarize
859
+ 'relbert/analogy_questions', # analogy QA
860
+ 'yitingxie/rlhf-reward-datasets', # prompt, chosen, rejected.
861
+ 'yizhongw/self_instruct', # instruct (super natural & instruct)
862
+ 'HuggingFaceH4/asss', # QA, big A
863
+ 'kastan/rlhf-qa-conditional-generation-v2', # QA
864
+ 'cosmos_qa', # context QA
865
+ 'vishal-burman/c4-faqs', # QA but not so much reasoning, but a lot of text
866
+ 'squadshifts', # QA from context
867
+ 'hotpot_qa', # QA from context
868
+ 'adversarial_qa', # QA from context
869
+ 'allenai/soda', # dialog -> narrative/summary
870
+ 'squad_v2', # context QA
871
+ 'squadshifts', # context QA
872
+ 'dferndz/cSQuAD1', # context QA
873
+ 'dferndz/cSQuAD2', # context QA
874
+ 'din0s/msmarco-nlgen', # context QA
875
+ 'domenicrosati/TruthfulQA', # common sense truthful QA -- trivia but good trivia
876
+ 'hotpot_qa', # context, QA
877
+ 'HuggingFaceH4/self-instruct-eval', # instruct QA, medium quality, some language reasoning
878
+ 'kastan/EE_QA_for_RLHF', # context QA
879
+ 'KK04/LogicInference_OA', # instruction logical QA
880
+ 'lmqg/qa_squadshifts_synthetic', # context QA
881
+ 'lmqg/qg_squad', # context QA
882
+ 'lmqg/qg_squadshifts', # context QA
883
+ 'lmqg/qg_subjqa', # context QA
884
+ 'pszemraj/HC3-textgen-qa', # QA medium, has human responses -- humans tend to provide links instead of trying to answer
885
+ 'pythonist/newdata', # long context, QA, brief A
886
+ 'ropes', # long background, situation, question, A
887
+ 'wikitablequestions', # table -> QA
888
+ 'bigscience/p3', # context QA but short answers
889
+ ]
890
+
891
+
892
+
893
+ code_useful = ['0n1xus/codexglue',
894
+ 'openai_humaneval',
895
+ 'koutch/staqc',
896
+ ]
897
+
898
+
899
+ maybe_useful = ['AlekseyKorshuk/comedy-scripts',
900
+ 'openbookqa', # hard to parse, low reasoning
901
+ 'qed', # reasonable QA, but low reasoning
902
+ 'selqa', # candidate answers
903
+ 'HuggingFaceH4/instruction-pilot-outputs-filtered',
904
+ 'GBaker/MedQA-USMLE-4-options', # medical QA with long questions
905
+ 'npc-engine/light-batch-summarize-dialogue', # dialog summarize, kinda low specific quality
906
+ ]
907
+
908
+
909
+ summary_useful = ['austin/rheum_abstracts',
910
+ 'CarperAI/openai_summarize_comparisons', # summarize chosen/rejected
911
+ 'CarperAI/openai_summarize_tldr', # summarize QA
912
+ 'ccdv/cnn_dailymail', # summarize news
913
+ 'ccdv/govreport-summarization', # summarize high quality
914
+ 'ccdv/pubmed-summarization', # summarize high quality
915
+ 'duorc', # plot -> QA
916
+ 'farleyknight/big_patent_5_percent', # desc -> abstract
917
+ 'multi_news', # summary
918
+ 'opinosis',
919
+ 'SophieTr/reddit_clean',
920
+ 'allenai/mup', # long text -> summary
921
+ 'allenai/multi_lexsum', # long text -> summary
922
+ 'big_patent',
923
+ 'allenai/wcep_dense_max',
924
+ 'awinml/costco_long_practice',
925
+ 'GEM/xsum',
926
+ 'ratishsp/newshead',
927
+ 'RussianNLP/wikiomnia', # russian
928
+ 'stacked-summaries/stacked-xsum-1024',
929
+ ]
930
+
931
+
932
+ math_useful = [
933
+ 'competition_math'
934
+ ]
935
+
936
+
937
+ skipped = ['c4', # maybe useful, used for flan, but skipped due to size
938
+ ]
939
+
940
+ """
941
+ To get training data from oig:
942
+ pytest test_oig test_grade_final test_finalize_to_json
943
+ """
944
+
945
+ human = '<human>:'
946
+ bot = '<bot>:'
947
+
948
+
949
+ def test_assemble_and_detox():
950
+ import re
951
+ from profanity_check import predict_prob
952
+ df_list = []
953
+ for data in useful_oig_files:
954
+ print("Processing %s" % data, flush=True)
955
+ df = pd.read_parquet(data)
956
+ df = df.reset_index(drop=True)
957
+ # chop up into human/bot interactions of no more than 10kB per row
958
+ text_list = df[['text']].values.ravel().tolist()
959
+ new_text = []
960
+ max_len = 2048 # uber cutoff
961
+ MAX_LEN = 2048//2 - 30 # max len per question/answer
962
+ for text in tqdm(text_list):
963
+ human_starts = [m.start() for m in re.finditer('<human>: ', text)]
964
+ if len(human_starts) == 1:
965
+ human_starts = [0, len(text)] # always go into for loop below
966
+ blurb = ''
967
+ for i in range(len(human_starts) - 1):
968
+ interaction = text[human_starts[i]: human_starts[i+1]][:max_len]
969
+ blurb += interaction
970
+ if len(blurb) >= MAX_LEN:
971
+ blurb = get_sentences(blurb, length=MAX_LEN)[0]
972
+ new_text.append(blurb + "\n<human>:")
973
+ blurb = ''
974
+ if blurb:
975
+ blurb = get_sentences(blurb, length=MAX_LEN)[0]
976
+ new_text.append(blurb + "\n<human>:")
977
+
978
+ if len(new_text) > len(text_list):
979
+ print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0]))
980
+ df = pd.DataFrame({"text": new_text, "source": [data] * len(new_text)})
981
+ df = df.drop_duplicates(keep='first')
982
+ print(df['text'].apply(lambda x: len(x)).describe())
983
+ assert df['text'].apply(lambda x: len(x)).max() <= 2 * max_len
984
+
985
+ # faster than better_profanity, do early
986
+ df['profanity'] = predict_prob(df['text'])
987
+ before_rows = df.shape[0]
988
+ df = df[df['profanity'] < 0.25] # drop any low quality stuff
989
+ after_rows = df.shape[0]
990
+ print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows))
991
+ df_list.append(df)
992
+ print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
993
+ print("So far have %d rows" % sum([len(x) for x in df_list]))
994
+ df_final = pd.concat(df_list)
995
+ df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True)
996
+ df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False)
997
+
998
+
999
+ def test_basic_cleaning():
1000
+ # from better_profanity import profanity
1001
+ # https://pypi.org/project/alt-profanity-check/
1002
+ from profanity_check import predict
1003
+ df_list = []
1004
+ for data in useful_oig_files:
1005
+ #for data in useful_oig_files[:5]:
1006
+ #for data in ['unified_openai_summarize_tldr.jsonl.parquet']:
1007
+ print("Processing %s" % data, flush=True)
1008
+ df = pd.read_parquet(data)
1009
+ df = df.reset_index(drop=True)
1010
+ # NOTE: Not correct if there are multiple human-bot interactions, but those dialogs are even more desired
1011
+ #avg_chars = len(df['text'][0])/(df['text'][0].count(human)+df['text'][0].count(bot))
1012
+ df['avg_words'] = df['text'].apply(lambda x: x.count(' ') / (x.count(human) + x.count(bot))/2.0)
1013
+ df['avg_bot_words'] = df['text'].apply(lambda x: x.split(bot)[1].count(' ') / x.count(bot))
1014
+ #df['bad_words'] = df['text'].apply(lambda x: profanity.contains_profanity(x))
1015
+ #low_quality_patterns = ['Write the rest of this wikipedia article']
1016
+ res = predict(df['text'])
1017
+ df['bad_words'] = res
1018
+ df = df.reset_index(drop=True)
1019
+ df = df[df['bad_words'] == 0]
1020
+ df = df[['text', 'avg_words', 'avg_bot_words']]
1021
+ df = df.drop_duplicates(keep='first')
1022
+ print(df[df['avg_words'] == df['avg_words'].max()]['text'].values)
1023
+ median_words = np.median(df['avg_words'])
1024
+ min_words_per_entity = max(30, 0.8 * median_words)
1025
+ max_words_per_entity = 2048 # too hard to learn from for now
1026
+ df = df[df['avg_words'] > min_words_per_entity]
1027
+ df = df[df['avg_words'] < max_words_per_entity]
1028
+
1029
+ min_words_per_entity = max(20, 0.5 * median_words) # bot should say stuff for now
1030
+ max_words_per_entity = 2048 # too hard to learn from for now
1031
+ df = df[df['avg_bot_words'] > min_words_per_entity]
1032
+ df = df[df['avg_bot_words'] < max_words_per_entity]
1033
+
1034
+ df_list.append(df)
1035
+ print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
1036
+ df_final = pd.concat(df_list)
1037
+ df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False)
1038
+
1039
+
1040
+ from joblib import Parallel, delayed, effective_n_jobs
1041
+ from sklearn.utils import gen_even_slices
1042
+ from sklearn.utils.validation import _num_samples
1043
+
1044
+
1045
+ def parallel_apply(df, func, n_jobs=-1, **kwargs):
1046
+ """ Pandas apply in parallel using joblib.
1047
+ Uses sklearn.utils to partition input evenly.
1048
+
1049
+ Args:
1050
+ df: Pandas DataFrame, Series, or any other object that supports slicing and apply.
1051
+ func: Callable to apply
1052
+ n_jobs: Desired number of workers. Default value -1 means use all available cores.
1053
+ **kwargs: Any additional parameters will be supplied to the apply function
1054
+
1055
+ Returns:
1056
+ Same as for normal Pandas DataFrame.apply()
1057
+
1058
+ """
1059
+
1060
+ if effective_n_jobs(n_jobs) == 1:
1061
+ return df.apply(func, **kwargs)
1062
+ else:
1063
+ ret = Parallel(n_jobs=n_jobs)(
1064
+ delayed(type(df).apply)(df[s], func, **kwargs)
1065
+ for s in gen_even_slices(_num_samples(df), effective_n_jobs(n_jobs)))
1066
+ return pd.concat(ret)
1067
+
1068
+
1069
+ def add_better_profanity_flag(df):
1070
+ from better_profanity import profanity
1071
+ df['better_profanity'] = parallel_apply(
1072
+ df['text'],
1073
+ lambda x: profanity.contains_profanity(x),
1074
+ n_jobs=-1,
1075
+ )
1076
+ return df
1077
+
1078
+
1079
+ def add_textstat_grade(df):
1080
+ import textstat
1081
+
1082
+ def myfunc(x):
1083
+ return textstat.flesch_kincaid_grade(x) # simple grade
1084
+
1085
+ if False:
1086
+ import dask.dataframe as dd
1087
+ # 40 seconds for 1000 rows, but have 1,787,799 rows
1088
+ ddata = dd.from_pandas(df, npartitions=120)
1089
+
1090
+ df['flesch_grade'] = ddata['text'].apply(myfunc).compute()
1091
+ if True:
1092
+ # fast way
1093
+ df['flesch_grade'] = parallel_apply(df['text'], myfunc, n_jobs=-1)
1094
+ return df
1095
+
1096
+
1097
+ def add_deberta_grade(df):
1098
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
1099
+ import torch
1100
+ reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"
1101
+ rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(
1102
+ reward_name), AutoTokenizer.from_pretrained(reward_name)
1103
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
1104
+ rank_model.to(device)
1105
+
1106
+ def get_question(x):
1107
+ return x.replace('<human>: ', '').split('<bot>:')[0]
1108
+
1109
+ def get_answer(x):
1110
+ try:
1111
+ answer = x.split('<bot>: ')[1].split('<human>:')[0].replace('<bot>: ', '')
1112
+ except:
1113
+ answer = x.split('<bot>:')[1].split('<human>:')[0].replace('<bot>:', '')
1114
+ return answer
1115
+
1116
+ df['question'] = parallel_apply(df['text'], get_question, n_jobs=-1)
1117
+ df['answer'] = parallel_apply(df['text'], get_answer, n_jobs=-1)
1118
+
1119
+ from datasets import Dataset
1120
+ from transformers import pipeline
1121
+ from transformers.pipelines.pt_utils import KeyPairDataset
1122
+ import tqdm
1123
+
1124
+ pipe = pipeline(
1125
+ "text-classification",
1126
+ model=reward_name,
1127
+ device="cuda:0" if torch.cuda.is_available() else "cpu"
1128
+ )
1129
+ start = 0
1130
+ batch_size = 64 * 16
1131
+ micro_batch = orig_micro_batch = 16
1132
+ end = 0
1133
+ import socket
1134
+ checkpoint = "grades.%s.pkl" % socket.gethostname()
1135
+ grades = []
1136
+ import pickle
1137
+ if os.path.exists(checkpoint):
1138
+ with open(checkpoint, "rb") as f:
1139
+ start, grades = pickle.loads(f.read())
1140
+ last_oom = 0
1141
+ while end < df.shape[0]:
1142
+ # manual batching to handle OOM more gracefully
1143
+ end = min(start + batch_size, df.shape[0])
1144
+ if start == end:
1145
+ break
1146
+ dataset = Dataset.from_pandas(df.iloc[start:end, :])
1147
+ try:
1148
+ grades.extend([
1149
+ x['score'] for x in tqdm.tqdm(
1150
+ pipe(KeyPairDataset(dataset, "question", "answer"), batch_size=micro_batch)
1151
+ )
1152
+ ])
1153
+ except torch.cuda.OutOfMemoryError:
1154
+ last_oom = start
1155
+ micro_batch = max(1, micro_batch // 2)
1156
+ print("OOM - retrying with micro_batch=%d" % micro_batch)
1157
+ continue
1158
+ if last_oom == start:
1159
+ micro_batch = orig_micro_batch
1160
+ print("Returning to micro_batch=%d" % micro_batch)
1161
+ assert len(grades) == end
1162
+ start = end
1163
+ with open(checkpoint, "wb") as f:
1164
+ f.write(pickle.dumps((end, grades)))
1165
+ print("%d/%d" % (end, df.shape[0]))
1166
+ df['grade_deberta'] = grades
1167
+ if os.path.exists(checkpoint):
1168
+ os.remove(checkpoint)
1169
+ return df
1170
+
1171
+
1172
+ def test_chop_by_lengths():
1173
+ file = "h2oGPT.cleaned.human_bot.shorter.parquet"
1174
+ df = pd.read_parquet(file).reset_index(drop=True)
1175
+ df = count_human_bot_lengths(df)
1176
+ df['rand'] = np.random.rand(df.shape[0])
1177
+ df['rand2'] = np.random.rand(df.shape[0])
1178
+ before_rows = df.shape[0]
1179
+ # throw away short human/bot responses with higher likelihood
1180
+ df = df[(df['len_human_mean'] > 20)] # never keep very short ones
1181
+ df = df[(df['len_human_mean'] > 30) | (df['rand'] < 0.2)]
1182
+ df = df[(df['len_human_mean'] > 50) | (df['rand'] < 0.5)]
1183
+ df = df[(df['len_human_max'] < 10000)] # drop super long (basically only human) ones
1184
+ df = df[(df['len_bot_mean'] > 20)] # never keep very short ones
1185
+ df = df[(df['len_bot_mean'] > 30) | (df['rand2'] < 0.2)]
1186
+ df = df[(df['len_bot_mean'] > 50) | (df['rand2'] < 0.5)]
1187
+ df = df[(df['len_bot_max'] < 10000)] # drop super long (only bot) ones
1188
+ assert df['text'].apply(lambda x: len(x)).max() < 20000
1189
+ df = df.drop(['rand', 'rand2'], axis=1)
1190
+ after_rows = df.shape[0]
1191
+ print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows))
1192
+ print(df.describe())
1193
+ df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False)
1194
+
1195
+
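Editor's note: the filter above is probabilistic rather than a hard cutoff, so short rows are kept only when their random draw is small enough. A toy illustration of that "soft" keep rule:

import numpy as np
import pandas as pd

np.random.seed(0)
toy = pd.DataFrame({'len_human_mean': [25, 40, 80],
                    'rand': np.random.rand(3)})
# rows with mean human length <= 50 survive only if their random draw is below 0.5
kept = toy[(toy['len_human_mean'] > 50) | (toy['rand'] < 0.5)]
print(kept)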
1196
+ def count_human_bot_lengths(df, human=None, bot=None):
1197
+ import re
1198
+ len_human_min = []
1199
+ len_human_max = []
1200
+ len_human_mean = []
1201
+ len_bot_min = []
1202
+ len_bot_max = []
1203
+ len_bot_mean = []
1204
+ human = human or '<human>:'
1205
+ bot = bot or '<bot>:'
1206
+ for is_human in [True, False]:
1207
+ what = human if is_human else bot
1208
+ other = human if not is_human else bot
1209
+ for i in range(df.shape[0]):
1210
+ text = df.loc[i, 'text']
1211
+ assert isinstance(text, str)
1212
+ starts = [m.start() for m in re.finditer(what, text)]
1213
+ if len(starts) == 1:
1214
+ starts = [starts[0], len(text)] # always go into for loop below
1215
+ assert len(text)
1216
+ list_what = []
1217
+ for ii in range(len(starts) - 1):
1218
+ interaction = text[starts[ii]: starts[ii+1]]
1219
+ if other in interaction:
1220
+ interaction = interaction[:interaction.find(other)]
1221
+ interaction = interaction.strip()
1222
+ list_what.append(interaction)
1223
+ if not list_what:
1224
+ list_what = [''] # handle corrupted data, very rare, leads to sizes 0
1225
+ if is_human:
1226
+ len_human_min.append(min([len(x) for x in list_what]))
1227
+ len_human_max.append(max([len(x) for x in list_what]))
1228
+ len_human_mean.append(np.mean([len(x) for x in list_what]))
1229
+ else:
1230
+ len_bot_min.append(min([len(x) for x in list_what]))
1231
+ len_bot_max.append(max([len(x) for x in list_what]))
1232
+ len_bot_mean.append(np.mean([len(x) for x in list_what]))
1233
+ df['len_human_min'] = len_human_min
1234
+ df['len_human_max'] = len_human_max
1235
+ df['len_human_mean'] = len_human_mean
1236
+ df['len_bot_min'] = len_bot_min
1237
+ df['len_bot_max'] = len_bot_max
1238
+ df['len_bot_mean'] = len_bot_mean
1239
+ np.random.seed(1234)
1240
+ pd.set_option('display.max_columns', None)
1241
+ print("Before chopping")
1242
+ print(df.describe())
1243
+ return df
1244
+
1245
+
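Editor's note: a small worked example of the per-turn splitting logic above: locate each '<human>:' tag, cut the span at the next '<bot>:' tag, and measure what remains.

import re

text = "<human>: hi there\n<bot>: hello!\n<human>: how are you?\n<bot>: fine."
starts = [m.start() for m in re.finditer('<human>:', text)]
turns = []
for i, s in enumerate(starts):
    end = starts[i + 1] if i + 1 < len(starts) else len(text)
    chunk = text[s:end]
    if '<bot>:' in chunk:
        chunk = chunk[:chunk.find('<bot>:')]  # keep only the human side of this turn
    turns.append(chunk.replace('<human>:', '').strip())
print([len(t) for t in turns], turns)  # [8, 12] ['hi there', 'how are you?']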
1246
+ def test_grade():
1247
+ df = None
1248
+
1249
+ file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet"
1250
+ output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet"
1251
+ if not os.path.exists(output_file):
1252
+ if df is None:
1253
+ df = pd.read_parquet(file).reset_index(drop=True)
1254
+ df = add_textstat_grade(df)
1255
+ min_grade = 10
1256
+ max_grade = 25
1257
+ df = df[df['flesch_grade'] >= min_grade]
1258
+ df = df[df['flesch_grade'] <= max_grade]
1259
+ print("After Flesch grade")
1260
+ print(df.describe())
1261
+ df.to_parquet(output_file, index=False)
1262
+
1263
+ file = output_file
1264
+ output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet"
1265
+ if not os.path.exists(output_file):
1266
+ # slower than alt-profanity, do last, but do before deberta grading, since that's slower
1267
+ if df is None:
1268
+ df = pd.read_parquet(file).reset_index(drop=True)
1269
+ df = add_better_profanity_flag(df)
1270
+ before_rows = df.shape[0]
1271
+ df = df[df['better_profanity'] == 0]
1272
+ df = df.drop(['better_profanity'], axis=1)
1273
+ after_rows = df.shape[0]
1274
+ print("Dropped %d rows out of %d due to better_profanity" % (before_rows - after_rows, before_rows))
1275
+ print(df.describe())
1276
+ df.to_parquet(output_file, index=False)
1277
+
1278
+ file = output_file
1279
+ output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet'
1280
+ if not os.path.exists(output_file):
1281
+ if df is None:
1282
+ df = pd.read_parquet(file).reset_index(drop=True)
1283
+ df = add_deberta_grade(df)
1284
+ min_grade = 0.3
1285
+ max_grade = np.inf
1286
+ before_rows = df.shape[0]
1287
+ df = df[df['grade_deberta'] >= min_grade]
1288
+ df = df[df['grade_deberta'] <= max_grade]
1289
+ after_rows = df.shape[0]
1290
+ print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
1291
+ print("After DeBERTa grade")
1292
+ print(df.describe())
1293
+ df.to_parquet(output_file, index=False)
1294
+
1295
+ file = output_file
1296
+ output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet'
1297
+ if df is None:
1298
+ df = pd.read_parquet(file).reset_index(drop=True)
1299
+ df.to_parquet(output_file, index=False)
1300
+
1301
+
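Editor's note: all three grading stages above follow the same resume-friendly pattern, skipping a stage whenever its output parquet already exists. Stripped of the grading specifics, the pattern is roughly the sketch below (`run_stage` is a hypothetical helper, not part of this file):

import os
import pandas as pd

def run_stage(in_file, out_file, transform):
    # recompute only when the stage's output is missing, so reruns resume cheaply
    if not os.path.exists(out_file):
        df = pd.read_parquet(in_file).reset_index(drop=True)
        df = transform(df)
        df.to_parquet(out_file, index=False)
    return out_file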
1302
+ @pytest.mark.parametrize(
1303
+ "fixup_personality, only_personality, deberta_grading",
1304
+ [
1305
+ [False, False, False],
1306
+ [True, True, False],
1307
+ [True, False, False],
1308
+ [True, False, True],
1309
+ ]
1310
+ )
1311
+ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, save_json=True):
1312
+ """
1313
+ Flatten tree structure into one row per path from root to leaf
1314
+ Also turn into human_bot prompting format:
1315
+ <human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc.
1316
+ Also saves a .json locally as side-effect
1317
+ returns list of dicts, containing input, prompt_type and source
1318
+ """
1319
+ from datasets import load_dataset
1320
+ data_file = "OpenAssistant/oasst1"
1321
+ ds = load_dataset(data_file)
1322
+ df = pd.concat([ds['train'].to_pandas(), ds['validation'].to_pandas()], axis=0)
1323
+ rows = {}
1324
+ message_ids = df['message_id'].values.tolist()
1325
+ message_tree_ids = df['message_tree_id'].values.tolist()
1326
+ parent_ids = df['parent_id'].values.tolist()
1327
+ texts = df['text'].values.tolist()
1328
+ roles = df['role'].values.tolist()
1329
+
1330
+ for i in range(df.shape[0]):
1331
+ # collect all trees
1332
+ message_id = message_ids[i]
1333
+ message_tree_id = message_tree_ids[i]
1334
+ parent_id = parent_ids[i]
1335
+ text = texts[i]
1336
+ if fixup_personality:
1337
+ text = text.replace("Open Assistant", "h2oGPT")
1338
+ text = text.replace("Open-Assistant", "h2oGPT")
1339
+ text = text.replace("open-assistant", "h2oGPT")
1340
+ text = text.replace("OpenAssistant", "h2oGPT")
1341
+ text = text.replace("open assistant", "h2oGPT")
1342
+ text = text.replace("Open Assistand", "h2oGPT")
1343
+ text = text.replace("Open Assitant", "h2oGPT")
1344
+ text = text.replace("Open Assistent", "h2oGPT")
1345
+ text = text.replace("Open Assisstant", "h2oGPT")
1346
+ text = text.replace("Open Assitent", "h2oGPT")
1347
+ text = text.replace("Open Assitiant", "h2oGPT")
1348
+ text = text.replace("Open Assistiant", "h2oGPT")
1349
+ text = text.replace("Open Assitan ", "h2oGPT ")
1350
+ text = text.replace("Open Assistan ", "h2oGPT ")
1351
+ text = text.replace("Open Asistant", "h2oGPT")
1352
+ text = text.replace("Open Assiant", "h2oGPT")
1353
+ text = text.replace("Assistant", "h2oGPT")
1354
+ text = text.replace("LAION AI", "H2O.ai")
1355
+ text = text.replace("LAION-AI", "H2O.ai")
1356
+ text = text.replace("LAION,", "H2O.ai,")
1357
+ text = text.replace("LAION.ai", "H2O.ai")
1358
+ text = text.replace("LAION.", "H2O.ai.")
1359
+ text = text.replace("LAION", "H2O.ai")
1360
+
1361
+ role = roles[i]
1362
+ new_data = ('<human>: ' if role == 'prompter' else '<bot>: ') + text
1363
+ entry = dict(message_id=message_id, parent_id=parent_id, text=new_data)
1364
+ if message_tree_id not in rows:
1365
+ rows[message_tree_id] = [entry]
1366
+ else:
1367
+ rows[message_tree_id].append(entry)
1368
+
1369
+ all_rows = []
1370
+
1371
+ for node_id in rows:
1372
+ # order responses in tree, based on message/parent relationship
1373
+ conversations = []
1374
+
1375
+ list_msgs = rows[node_id]
1376
+ # find start
1377
+ while len(list_msgs):
1378
+ for i, leaf in enumerate(list_msgs):
1379
+ found = False
1380
+ parent_id = leaf['parent_id']
1381
+ if parent_id is None:
1382
+ # conversation starter
1383
+ conversations.append(leaf)
1384
+ found = True
1385
+ else:
1386
+ for conv in conversations:
1387
+ # find all conversations to add my message to
1388
+ if parent_id in conv['message_id'] and parent_id != conv['message_id'][-len(parent_id):]:
1389
+ # my message doesn't follow conversation
1390
+ continue
1391
+ if parent_id == conv['message_id'][-len(parent_id):]:
1392
+ # my message follows conversation, but fork first, so another follow-on message can do same
1393
+ conversations.append(conv.copy())
1394
+ conv['text'] += f"""
1395
+ {leaf['text']}
1396
+ """
1397
+ conv['message_id'] += leaf['message_id']
1398
+ found = True
1399
+ break
1400
+ if found:
1401
+ # my content was used, so nuke from list
1402
+ del list_msgs[i]
1403
+ break
1404
+
1405
+ # now reduce down to final conversations, find the longest chains of message ids
1406
+ for i, conv in enumerate(conversations):
1407
+ for j, conv2 in enumerate(conversations):
1408
+ if i == j:
1409
+ continue
1410
+ if conv['message_id'] and conv2['message_id']:
1411
+ assert conv['message_id'] != conv2['message_id']
1412
+ # delete the shorter conversation, if one contains the other
1413
+ if conv['message_id'] in conv2['message_id']:
1414
+ conv['message_id'] = None
1415
+ elif conv2['message_id'] in conv['message_id']:
1416
+ conv2['message_id'] = None
1417
+ conversations = [c for c in conversations if c['message_id']]
1418
+ if only_personality:
1419
+ all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
1420
+ else:
1421
+ all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
1422
+ unhelpful = get_unhelpful_list()
1423
+ all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
1424
+ personality = create_personality_data()
1425
+ all_rows.extend(personality * 10)
1426
+ np.random.seed(123)
1427
+ np.random.shuffle(all_rows)
1428
+ print(len(all_rows))
1429
+ if deberta_grading:
1430
+ df = pd.DataFrame(all_rows)
1431
+ df = df.rename(columns={'input': 'text'})
1432
+ df = add_deberta_grade(df)
1433
+ df = df.rename(columns={'text': 'input'})
1434
+ drop = True
1435
+ if drop:
1436
+ min_grade = 0.3
1437
+ max_grade = np.inf
1438
+ before_rows = df.shape[0]
1439
+ df = df[df['grade_deberta'] >= min_grade]
1440
+ df = df[df['grade_deberta'] <= max_grade]
1441
+ after_rows = df.shape[0]
1442
+ print("Dropped %d rows out of %d due to deberta grade" % (before_rows - after_rows, before_rows))
1443
+ print("After DeBERTa grade")
1444
+ print(df.describe())
1445
+ all_rows = []
1446
+ for i in range(df.shape[0]):
1447
+ all_rows.append(
1448
+ dict(
1449
+ input=df['input'].iloc[i],
1450
+ source=df['source'].iloc[i],
1451
+ prompt_type=df['prompt_type'].iloc[i],
1452
+ grade_deberta=df['grade_deberta'].iloc[i],
1453
+ )
1454
+ )
1455
+ if save_json:
1456
+ data_file = data_file + \
1457
+ ("_h2ogpt" if fixup_personality else "") + \
1458
+ ("_only" if only_personality else "") + \
1459
+ ("_graded" if deberta_grading else "")
1460
+ for i in range(len(all_rows)):
1461
+ all_rows[i]['id'] = i
1462
+ with open(data_file.lower().replace("/", "_") + ".json", "w") as f:
1463
+ f.write(json.dumps(all_rows, indent=2))
1464
+ return all_rows
1465
+
1466
+
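Editor's note: a toy sketch of the same root-to-leaf flattening, written recursively for brevity (the function above does it iteratively by forking conversations so siblings each get their own copy of the shared prefix):

msgs = [
    dict(message_id='a', parent_id=None, text='<human>: hi'),
    dict(message_id='b', parent_id='a', text='<bot>: hello'),
    dict(message_id='c', parent_id='a', text='<bot>: hey'),
]

paths = []

def walk(node_id, so_far):
    children = [m for m in msgs if m['parent_id'] == node_id]
    if not children:
        paths.append(so_far)  # a leaf closes one conversation
    for child in children:
        walk(child['message_id'], so_far + [child['text']])

for root in [m for m in msgs if m['parent_id'] is None]:
    walk(root['message_id'], [root['text']])

print(["\n".join(p) for p in paths])  # two conversations: hi->hello and hi->hey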
1467
+ def test_finalize_to_json():
1468
+ df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet')
1469
+ df = df.rename(columns={'text': 'input'})
1470
+
1471
+ print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True)
1472
+
1473
+ print("Adding open assistant data")
1474
+ with open("openassistant_oasst1_h2ogpt_graded.json") as f:
1475
+ open_assistant = json.loads(f.read())
1476
+ df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0)
1477
+
1478
+ def final_clean(df):
1479
+ from better_profanity import profanity
1480
+ profanity.load_censor_words_from_file("data/censor_words.txt")
1481
+ df['profanity'] = parallel_apply(
1482
+ df['input'],
1483
+ lambda x: profanity.contains_profanity(x),
1484
+ n_jobs=-1,
1485
+ )
1486
+ return df[(df['profanity'] == 0)].reset_index(drop=True)
1487
+ print("Before cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
1488
+ df = final_clean(df)
1489
+ print("After cleaning: Number of final high-quality human_bot interactions: %s" % df.shape[0], flush=True)
1490
+ print(df.describe())
1491
+ print(df.shape)
1492
+ row_list = []
1493
+ for i in range(df.shape[0]):
1494
+ row_list.append(
1495
+ dict(
1496
+ input=df.loc[i, 'input'],
1497
+ source=df.loc[i, 'source'],
1498
+ prompt_type='plain',
1499
+ )
1500
+ )
1501
+ np.random.seed(1234)
1502
+ np.random.shuffle(row_list)
1503
+ unhelpful = get_unhelpful_list()
1504
+ row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)]
1505
+ for i in range(len(row_list)):
1506
+ row_list[i]['id'] = i
1507
+ row_list[i]['input'] = row_list[i]['input'].replace(" <bot>:", "\n<bot>:")
1508
+ with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f:
1509
+ f.write(json.dumps(row_list, indent=2))
1510
+
1511
+
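Editor's note: every entry written to the final JSON above is a flat record shaped roughly like this (values illustrative):

example_record = {
    "input": "<human>: Who are you?\n<bot>: I'm h2oGPT, a large language model by H2O.ai.\n<human>:",
    "source": "OpenAssistant/oasst1",
    "prompt_type": "plain",
    "id": 0,
}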
1512
+ def create_personality_data():
1513
+ questions = [
1514
+ "What's your name?",
1515
+ "What is your name?",
1516
+ "What are you?",
1517
+ "Who are you?",
1518
+ "Do you have a name?",
1519
+ "Who trained you?",
1520
+ "Who created you?",
1521
+ "Who made you?",
1522
+ ]
1523
+ answers = [
1524
+ "I'm h2oGPT, a large language model by H2O.ai.",
1525
+ "I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
1526
+ "My name is h2oGPT. I'm a large language model by H2O.ai, the visionary leader in democratizing AI.",
1527
+ "My name is h2oGPT. I'm a large language model trained by H2O.ai.",
1528
+ "Hi! I'm h2oGPT, a large language model by H2O.ai.",
1529
+ "Hi! I'm h2oGPT, a large language model by H2O.ai, the visionary leader in democratizing AI.",
1530
+ ]
1531
+ help = [
1532
+ "",
1533
+ " How can I help you?",
1534
+ " How may I assist you?",
1535
+ " Nice to meet you.",
1536
+ ]
1537
+ import itertools
1538
+ rows = []
1539
+ for pair in itertools.product(questions, answers, help):
1540
+ rows.append(
1541
+ dict(input=f"<human>: {pair[0]}\n<bot>: {pair[1]}{pair[2]}\n<human>:", prompt_type='plain', source="H2O.ai")
1542
+ )
1543
+ for row in [
1544
+ "<human>: What is H2O.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1545
+ "<human>: What is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1546
+ "<human>: What is H2O?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1547
+ "<human>: Who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1548
+ "<human>: who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1549
+ "<human>: who is h2o?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
1550
+ "<human>: What is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
1551
+ "<human>: Who is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
1552
+ "<human>: Who is H2O?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
1553
+ "<human>: Who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
1554
+ "<human>: who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
1555
+ ]:
1556
+ rows.append(dict(input=row, prompt_type='plain', source='H2O.ai'))
1557
+ print(len(rows))
1558
+ with open("h2ogpt-personality.json", "w") as f:
1559
+ f.write(json.dumps(rows, indent=2))
1560
+ return rows
1561
+
1562
+
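Editor's note: a quick check of the combinatorics above: 8 questions x 6 answers x 4 closing phrases gives 192 generated rows, plus the 11 hand-written H2O.ai rows, for 203 total.

import itertools

questions, answers, closers = 8, 6, 4
generated = len(list(itertools.product(range(questions), range(answers), range(closers))))
assert generated == 192
print(generated + 11)  # 203 personality rows in h2ogpt-personality.json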
1563
+ def test_check_stats_data():
1564
+ filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json'
1565
+ df = pd.read_json(filename)
1566
+
1567
+ # get word stats
1568
+ df['char_count'] = df['input'].apply(lambda x: len(x))
1569
+ import matplotlib.pyplot as plt
1570
+ plt.figure(figsize=(10, 10))
1571
+ plt.hist(df['char_count'], bins=100)
1572
+ chars_avg = np.mean(df['char_count'])
1573
+ chars_median = np.median(df['char_count'])
1574
+ plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
1575
+ plt.savefig('chars_hist.png')
1576
+ plt.close()
1577
+
1578
+ # get tokenize stats for random sample of 1000 rows
1579
+ from finetune import generate_and_tokenize_prompt
1580
+ from loaders import get_loaders, get_tokenizer
1581
+ from functools import partial
1582
+
1583
+ llama_type = False
1584
+ tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b'
1585
+ model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
1586
+ local_files_only = False
1587
+ resume_download = True
1588
+ use_auth_token = False
1589
+ tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
1590
+ prompt_type = 'plain' # trained with data already in human bot form
1591
+ train_on_inputs = True
1592
+ add_eos_token = False
1593
+ cutoff_len = 512 # can choose 2048
1594
+ generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
1595
+ train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
1596
+ cutoff_len=cutoff_len, tokenizer=tokenizer)
1597
+ from datasets import load_dataset
1598
+ data = load_dataset("json", data_files={"train": filename})
1599
+ val_set_size = 0.90
1600
+ train_val = data["train"].train_test_split(
1601
+ test_size=val_set_size, shuffle=True, seed=42
1602
+ )
1603
+ train_data = train_val["train"]
1604
+ train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count())
1605
+
1606
+ df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count'])
1607
+
1608
+ plt.figure(figsize=(10, 10))
1609
+ plt.hist(df_tokens['token_count'], bins=100)
1610
+ token_avg = np.mean(df_tokens['token_count'])
1611
+ token_median = np.median(df_tokens['token_count'])
1612
+ plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
1613
+ plt.savefig('token_hist_%s.png' % cutoff_len)
1614
+ plt.close()
1615
+
1616
+
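Editor's note: a minimal sketch of the token-count statistic without the fine-tuning plumbing, assuming the same base tokenizer can be downloaded:

import numpy as np
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oasst1-512-20b')
texts = ["<human>: Who are you?\n<bot>: I'm h2oGPT.",
         "<human>: Hi\n<bot>: Hello!"]
counts = [len(tok(t)['input_ids']) for t in texts]
print(np.mean(counts), np.median(counts))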
1617
+ def get_unhelpful_list():
1618
+ # base versions
1619
+ unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
1620
+ "I'm sorry, but I don't understand your question. Could you please rephrase it?",
1621
+ "I'm sorry, I don't quite understand your question",
1622
+ "I'm sorry, I don't know",
1623
+ "I'm sorry, but I don't know",
1624
+ "I don't know anything",
1625
+ "I do not know",
1626
+ "I don't know",
1627
+ "I don't know how",
1628
+ "I do not know how",
1629
+ "Can you please explain what you mean",
1630
+ "please explain what you mean",
1631
+ "please explain",
1632
+ "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
1633
+ "I'm sorry but I don't understand what you mean",
1634
+ "I don't understand",
1635
+ "I don't have the ability",
1636
+ "I do not have the ability",
1637
+ "I do not have",
1638
+ "I am a language model,",
1639
+ "I am a large language model,",
1640
+ "I do not understand your question. Can you please try to make it clearer?",
1641
+ "I'm sorry, but as an AI language model",
1642
+ "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
1643
+ "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
1644
+ "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
1645
+ "I apologize, but I cannot perform the task you have requested.",
1646
+ "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
1647
+ "I'm sorry, I'm not sure what you're asking for here.",
1648
+ "I'm not sure what you are asking",
1649
+ "You need to provide more context",
1650
+ ]
1651
+ # reduced versions, with redundant parts, just to give context for where they came from
1652
+ unhelpful += ["sorry, I didn't quite understand your question",
1653
+ "I didn't quite understand your question",
1654
+ "I didn't understand your question",
1655
+ "I did not understand your question",
1656
+ "I did not understand the question",
1657
+ "could you please rephrase"
1658
+ "could you rephrase"
1659
+ "I do not understand your question.",
1660
+ "I do not understand the question.",
1661
+ "I do not understand that question.",
1662
+ "Can you please try to make it clearer",
1663
+ "Can you try to make it clearer",
1664
+ "sorry, but as an AI language model",
1665
+ "as an AI language model",
1666
+ "I apologize, but I cannot",
1667
+ "I cannot rephrase text",
1668
+ "I cannot understand. Your post is difficult to read and follow."
1669
+ "Your post is difficult to read and follow."
1670
+ "I apologize, but I am",
1671
+ "Sorry, but I am not ",
1672
+ "nor am I capable",
1673
+ "I am not capable of",
1674
+ "I apologize, but I cannot perform the task you have requested",
1675
+ "I cannot perform the task",
1676
+ "I cannot complete the task",
1677
+ "I'm sorry",
1678
+ "I am sorry",
1679
+ "do not have access",
1680
+ "not sure what you're asking for",
1681
+ "not sure what you are asking for",
1682
+ "not sure what is being asked",
1683
+ "I'm not sure what you are asking",
1684
+ "not sure what you are asking",
1685
+ "You need to provide more context",
1686
+ "provide more context",
1687
+ ]
1688
+ unhelpful += ["As a large language model",
1689
+ "cannot provide any information",
1690
+ "As an artificial intelligence I do not have the capability",
1691
+ "As an artificial intelligence I don't have the capability",
1692
+ "As an artificial intelligence I can't",
1693
+ "As an artificial intelligence I cannot",
1694
+ "I am sorry but I do not understand",
1695
+ "Can you please explain",
1696
+ "(sorry couldn't resist)",
1697
+ "(sorry could not resist)",
1698
+ " :)",
1699
+ " ;)",
1700
+ " :-)",
1701
+ " ;-)",
1702
+ " lol ",
1703
+ "Thanks so much!!!",
1704
+ "Thank You :)!!!",
1705
+ "Please try not to repeat",
1706
+ "I am an AI language model",
1707
+ "I'm a AI assistant that",
1708
+ "I'm an AI assistant that",
1709
+ "I am an AI assistant that",
1710
+ "etc.",
1711
+ "etc.etc.",
1712
+ "etc. etc.",
1713
+ "etc etc",
1714
+ ]
1715
+ return unhelpful
1716
+
1717
+
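Editor's note: the list above is applied as a plain substring blacklist over the flattened conversations, e.g.:

unhelpful = ["I'm sorry, I don't know", "as an AI language model"]
rows = [
    {'input': "<human>: hi\n<bot>: I'm sorry, I don't know.\n<human>:"},
    {'input': "<human>: What is 2+2?\n<bot>: 4\n<human>:"},
]
rows = [r for r in rows if not any(u in r['input'] for u in unhelpful)]
print(len(rows))  # 1 -- the apologetic row is dropped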
1718
+ def test_check_unhelpful():
1719
+ # file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
1720
+ file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
1721
+ # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
1722
+
1723
+ unhelpful = get_unhelpful_list()
1724
+ #data = json.load(open(file, 'rt'))
1725
+ df = pd.read_json(file)
1726
+
1727
+ use_reward_score_threshold = False
1728
+ use_bleu_threshold = False
1729
+ use_sentence_sim = True
1730
+
1731
+ from sacrebleu.metrics import BLEU
1732
+ bleu = BLEU()
1733
+ from nltk.translate.bleu_score import sentence_bleu
1734
+
1735
+ def get_bleu(actual, expected_list):
1736
+ #return bleu.sentence_score(actual, expected_list).score
1737
+ return sentence_bleu(expected_list, actual)
1738
+
1739
+ threshold = 0.0
1740
+ if use_reward_score_threshold:
1741
+ df = df[df['grade_deberta'] > threshold]
1742
+
1743
+ # back to as if original json load
1744
+ data = df.to_dict(orient='records')
1745
+ bads = {}
1746
+ string_all = str(data)
1747
+ for sub in unhelpful:
1748
+ bads[sub] = string_all.count(sub)
1749
+ bads = {k: v for k, v in bads.items() if v > 0}
1750
+ import pprint
1751
+ pp = pprint.PrettyPrinter(indent=4)
1752
+ pp.pprint(bads)
1753
+
1754
+ total_bads = sum(list(bads.values()))
1755
+ print('total_bads: %s' % total_bads, flush=True)
1756
+
1757
+ # check just bot
1758
+ import re
1759
+ human, bot = '<human>:', '<bot>:'  # speaker tags used in the stored conversations
+ convs = [[x.strip() for x in re.split(r'%s|%s' % (human, bot), y['input']) if x.strip()] for y in data]
1760
+ humans = [[x for i, x in enumerate(y) if i % 2 == 0] for y in convs]
1761
+ bots = [[x for i, x in enumerate(y) if i % 2 == 1] for y in convs]
1762
+
1763
+ # FIXME: apply back to json etc., just see for now
1764
+ bleu_threshold = 0.9
1765
+ if use_bleu_threshold:
1766
+ bots = [[x for x in y if get_bleu(x, unhelpful) < bleu_threshold] for y in tqdm(bots)]
1767
+
1768
+ cosine_sim_threshold = 0.8
1769
+ if use_sentence_sim:
1770
+ # pip install sentence-transformers==2.2.2
1771
+ from sentence_transformers import SentenceTransformer
1772
+ # sent_model = 'bert-base-nli-mean-tokens'
1773
+ #sent_model = 'nli-distilroberta-base-v2'
1774
+ sent_model = 'all-MiniLM-L6-v2'
1775
+ model = SentenceTransformer(sent_model)
1776
+ sentence_embeddings = model.encode(unhelpful)
1777
+ from sklearn.metrics.pairwise import cosine_similarity
1778
+ bots = [x for x in tqdm(bots) if np.max(cosine_similarity(model.encode(x), sentence_embeddings)) < cosine_sim_threshold]
1779
+
1780
+ bads_bots = {}
1781
+ string_all = str(bots)
1782
+ for sub in unhelpful:
1783
+ bads_bots[sub] = string_all.count(sub)
1784
+ bads_bots = {k: v for k, v in bads_bots.items() if v > 0}
1785
+ import pprint
1786
+ pp = pprint.PrettyPrinter(indent=4)
1787
+ pp.pprint(bads_bots)
1788
+
1789
+ total_bads_bots = sum(list(bads_bots.values()))
1790
+ print('threshold: %g use_bleu_threshold: %g total_bads_bots: %s total_bots: %s total_humans: %s' % (threshold, use_bleu_threshold, total_bads_bots, len(bots), len(humans)), flush=True)
1791
+
1792
+ # assert len(bads) == 0, bads
1793
+ assert len(bads_bots) == 0, bads_bots
1794
+
1795
+
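Editor's note: a small sketch of the embedding-similarity screen used above; candidate replies whose maximum cosine similarity to any blacklist phrase exceeds the threshold get dropped.

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')
unhelpful_emb = model.encode(["I'm sorry, I don't know"])
candidates = ["I don't really know, sorry.", "The capital of France is Paris."]
sims = cosine_similarity(model.encode(candidates), unhelpful_emb).max(axis=1)
keep = [c for c, s in zip(candidates, sims) if s < 0.8]
print(sims, keep)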
1796
+ def test_fortune2000_personalized():
1797
+ row_list = []
1798
+ import glob
1799
+ if not os.path.isdir("wikitext"):
1800
+ raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip")
1801
+ for file in glob.glob("wikitext/*.txt"):
1802
+ with open(file, "r") as f:
1803
+ blob = f.read()
1804
+ N = 512 * 4
1805
+ row_list.extend([{'input': s, 'prompt_type': 'plain', 'source': "%s" % os.path.basename(file)}
1806
+ for s in get_sentences(blob, N) if s])
1807
+ personality = create_personality_data()
1808
+ import copy
1809
+ for i in range(10):
1810
+ row_list.extend(copy.deepcopy(personality))
1811
+ np.random.seed(123)
1812
+ np.random.shuffle(row_list)
1813
+ for i in range(len(row_list)):
1814
+ row_list[i]['id'] = i
1815
+ for i in range(len(row_list)):
1816
+ assert row_list[i]['id'] == i
1817
+ with open("h2ogpt-fortune2000-personalized.json", "w") as ff:
1818
+ ff.write(json.dumps(row_list, indent=2))
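Editor's note: get_sentences is defined elsewhere in this repository (not shown in this diff); a rough stand-in with the same contract, i.e. chunks of at most N characters, might look like the hypothetical sketch below.

def get_sentences_sketch(blob, n):
    # naive fixed-width chunking; the real helper is sentence-aware but keeps the same size bound
    return [blob[i:i + n] for i in range(0, len(blob), n)]

print(get_sentences_sketch("one. two. three. four.", 10))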
finetune.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
2
  import sys
3
- import time
4
  from functools import partial
5
  from typing import List, Union
6
- from enum import Enum
7
  import fire
8
  import numpy as np
 
 
 
9
  from utils import get_githash, copy_code
10
  import torch
11
 
@@ -17,82 +18,6 @@ def log(*args, **kwargs):
17
  print(*args, **kwargs)
18
 
19
 
20
- class PromptType(Enum):
21
- plain = 0
22
- instruct = 1
23
- quality = 2
24
- human_bot = 3
25
- dai_faq = 4
26
- summarize = 5
27
- simple_instruct = 6
28
- instruct_vicuna = 7
29
- instruct_with_end = 8
30
- human_bot_orig = 9
31
- prompt_answer = 10
32
- open_assistant = 11
33
- wizard_lm = 12
34
-
35
-
36
- prompt_type_to_model_name = {
37
- 'plain': [
38
- 'EleutherAI/gpt-j-6B',
39
- 'EleutherAI/pythia-6.9b',
40
- 'EleutherAI/pythia-12b',
41
- 'EleutherAI/pythia-12b-deduped',
42
- 'EleutherAI/gpt-neox-20b',
43
- 'decapoda-research/llama-7b-hf',
44
- 'decapoda-research/llama-13b-hf',
45
- 'decapoda-research/llama-30b-hf',
46
- 'decapoda-research/llama-65b-hf',
47
- 'facebook/mbart-large-50-many-to-many-mmt',
48
- 'philschmid/bart-large-cnn-samsum',
49
- 'philschmid/flan-t5-base-samsum',
50
- 'gpt2',
51
- 'distilgpt2',
52
- 'mosaicml/mpt-7b-storywriter',
53
- 'mosaicml/mpt-7b-instruct', # internal code handles instruct
54
- 'mosaicml/mpt-7b-chat', # NC, internal code handles instruct
55
- ],
56
- 'prompt_answer': [
57
- 'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
58
- 'h2oai/h2ogpt-gm-oasst1-en-1024-12b',
59
- 'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
60
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt',
61
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
62
- ],
63
- 'instruct': [],
64
- 'instruct_with_end': ['databricks/dolly-v2-12b'],
65
- 'quality': [],
66
- 'human_bot': [
67
- 'h2oai/h2ogpt-oasst1-512-12b',
68
- 'h2oai/h2ogpt-oasst1-512-20b',
69
- 'h2oai/h2ogpt-oig-oasst1-512-20b',
70
- 'h2oai/h2ogpt-oig-oasst1-512-12b',
71
- 'h2oai/h2ogpt-oig-oasst1-512-6.9b',
72
- 'h2oai/h2ogpt-research-oasst1-512-30b', # private
73
- ],
74
- 'dai_faq': [],
75
- 'summarize': [],
76
- 'simple_instruct': ['t5-small', 't5-large', 'google/flan-t5', 'google/flan-t5-xxl', 'google/flan-ul2'],
77
- 'instruct_vicuna': ['AlekseyKorshuk/vicuna-7b', 'TheBloke/stable-vicuna-13B-HF', 'junelee/wizard-vicuna-13b'],
78
- 'human_bot_orig': ['togethercomputer/GPT-NeoXT-Chat-Base-20B'],
79
- "open_assistant": ['OpenAssistant/oasst-sft-7-llama-30b-xor', 'oasst-sft-7-llama-30b'],
80
- "wizard_lm": ['ehartford/WizardLM-7B-Uncensored', 'ehartford/WizardLM-13B-Uncensored'],
81
- }
82
-
83
- inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
84
- inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
85
-
86
- prompt_types_strings = []
87
- for p in PromptType:
88
- prompt_types_strings.extend([p.name])
89
-
90
-
91
- prompt_types = []
92
- for p in PromptType:
93
- prompt_types.extend([p.name, p.value, str(p.value)])
94
-
95
-
96
  # supported by huggingface evaluate
97
  supported_metrics = ['bleu', 'rouge', 'sacrebleu', 'meteor']
98
 
@@ -353,7 +278,7 @@ def train(
353
  if os.path.exists(checkpoint_name):
354
  log(f"Restarting from {checkpoint_name}")
355
  adapters_weights = torch.load(checkpoint_name)
356
- model = set_peft_model_state_dict(model, adapters_weights)
357
  else:
358
  log(f"Checkpoint {checkpoint_name} not found")
359
 
@@ -656,58 +581,6 @@ def train(
656
  log("\n If there's a warning about missing keys above, please disregard :)")
657
 
658
 
659
- def get_loaders(llama_type, model_name, reward_type):
660
- # NOTE: Some models need specific new prompt_type
661
- # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT".)
662
- if llama_type:
663
- from transformers import LlamaForCausalLM, LlamaTokenizer
664
- model_loader = LlamaForCausalLM
665
- tokenizer_loader = LlamaTokenizer
666
- elif 'distilgpt2' in model_name.lower():
667
- from transformers import AutoModelForCausalLM, AutoTokenizer
668
- return AutoModelForCausalLM, AutoTokenizer
669
- elif 'gpt2' in model_name.lower():
670
- from transformers import GPT2LMHeadModel, GPT2Tokenizer
671
- return GPT2LMHeadModel, GPT2Tokenizer
672
- elif 'mbart-' in model_name.lower():
673
- from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
674
- return MBartForConditionalGeneration, MBart50TokenizerFast
675
- elif 't5' == model_name.lower() or \
676
- 't5-' in model_name.lower() or \
677
- 'flan-' in model_name.lower():
678
- from transformers import AutoTokenizer, T5ForConditionalGeneration
679
- return T5ForConditionalGeneration, AutoTokenizer
680
- elif 'bigbird' in model_name:
681
- from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
682
- return BigBirdPegasusForConditionalGeneration, AutoTokenizer
683
- elif 'bart-large-cnn-samsum' in model_name or 'flan-t5-base-samsum' in model_name:
684
- from transformers import pipeline
685
- return pipeline, "summarization"
686
- elif reward_type or 'OpenAssistant/reward-model'.lower() in model_name.lower():
687
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
688
- return AutoModelForSequenceClassification, AutoTokenizer
689
- else:
690
- from transformers import AutoTokenizer, AutoModelForCausalLM
691
- model_loader = AutoModelForCausalLM
692
- tokenizer_loader = AutoTokenizer
693
- return model_loader, tokenizer_loader
694
-
695
-
696
- def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
697
- tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
698
- local_files_only=local_files_only,
699
- resume_download=resume_download,
700
- use_auth_token=use_auth_token)
701
-
702
- tokenizer.pad_token_id = 0 # different from the eos token
703
- # when generating, we will use the logits of right-most token to predict the next token
704
- # so the padding should be on the left,
705
- # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
706
- tokenizer.padding_side = "left" # Allow batched inference
707
-
708
- return tokenizer
709
-
710
-
711
  def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=False):
712
  # there's probably a way to do this with the tokenizer settings
713
  # but again, gotta move fast
@@ -765,253 +638,6 @@ def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=F
765
  return tokenized_full_prompt
766
 
767
 
768
- def get_prompt(prompt_type, chat, context, reduced):
769
- if prompt_type in [-1, "-1", "plain"]:
770
- promptA = promptB = PreInstruct = PreInput = PreResponse = ''
771
- terminate_response = []
772
- chat_sep = ''
773
- elif prompt_type == 'simple_instruct':
774
- promptA = promptB = PreInstruct = PreInput = PreResponse = None
775
- terminate_response = []
776
- chat_sep = '\n'
777
- elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
778
- promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
779
- promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
780
-
781
- PreInstruct = """
782
- ### Instruction:
783
- """
784
-
785
- PreInput = """
786
- ### Input:
787
- """
788
-
789
- PreResponse = """
790
- ### Response:
791
- """
792
- if prompt_type in [7, "7", "instruct_with_end"]:
793
- terminate_response = ['### End']
794
- else:
795
- terminate_response = None
796
- chat_sep = '\n'
797
- elif prompt_type in [1, "1", "quality"]:
798
- promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (chat and reduced) else ''
799
- promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (chat and reduced) else ''
800
-
801
- PreInstruct = """
802
- ### Instruction:
803
- """
804
-
805
- PreInput = """
806
- ### Input:
807
- """
808
-
809
- PreResponse = """
810
- ### Response:
811
- """
812
- terminate_response = None
813
- chat_sep = '\n'
814
- elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
815
- human = '<human>:'
816
- bot = "<bot>:"
817
- if reduced or context or prompt_type in [2, "2", "human_bot"]:
818
- preprompt = ''
819
- else:
820
- cur_date = time.strftime('%Y-%m-%d')
821
- cur_time = time.strftime('%H:%M:%S %p %Z')
822
-
823
- PRE_PROMPT = """\
824
- Current Date: {}
825
- Current Time: {}
826
-
827
- """
828
- preprompt = PRE_PROMPT.format(cur_date, cur_time)
829
- start = human
830
- promptB = promptA = '%s%s ' % (preprompt, start)
831
-
832
- PreInstruct = ""
833
-
834
- PreInput = None
835
-
836
- if reduced:
837
- # when making context, want it to appear as-if LLM generated, which starts with space after :
838
- PreResponse = bot + ' '
839
- else:
840
- # normally LLM adds space after this, because was how trained.
841
- # if add space here, non-unique tokenization will often make LLM produce wrong output
842
- PreResponse = bot
843
-
844
- terminate_response = [start, PreResponse]
845
- chat_sep = '\n'
846
- elif prompt_type in [3, "3", "dai_faq"]:
847
- promptA = ''
848
- promptB = 'Answer the following Driverless AI question.\n'
849
-
850
- PreInstruct = """
851
- ### Driverless AI frequently asked question:
852
- """
853
-
854
- PreInput = None
855
-
856
- PreResponse = """
857
- ### Driverless AI documentation answer:
858
- """
859
- terminate_response = ['\n\n']
860
- chat_sep = terminate_response
861
- elif prompt_type in [5, "5", "summarize"]:
862
- promptA = promptB = PreInput = ''
863
- PreInstruct = '## Main Text\n\n'
864
- PreResponse = '\n\n## Summary\n\n'
865
- terminate_response = None
866
- chat_sep = '\n'
867
- elif prompt_type in [6, "6", "instruct_vicuna"]:
868
- promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
869
- "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (chat and reduced) else ''
870
-
871
- PreInstruct = """
872
- ### Human:
873
- """
874
-
875
- PreInput = None
876
-
877
- PreResponse = """
878
- ### Assistant:
879
- """
880
- terminate_response = ['### Human:'] # but only allow terminate after prompt is found correctly, else can't terminate
881
- chat_sep = '\n'
882
- elif prompt_type in [10, "10", "prompt_answer"]:
883
- preprompt = ''
884
- prompt_tokens = "<|prompt|>"
885
- answer_tokens = "<|answer|>"
886
- start = prompt_tokens
887
- promptB = promptA = '%s%s' % (preprompt, start)
888
- PreInstruct = ""
889
- PreInput = None
890
- PreResponse = answer_tokens
891
- eos = '<|endoftext|>' # neox eos
892
- terminate_response = [start, PreResponse, eos]
893
- chat_sep = eos
894
- elif prompt_type in [11, "11", "open_assistant"]:
895
- # From added_tokens.json
896
- preprompt = ''
897
- prompt_tokens = "<|prompter|>"
898
- answer_tokens = "<|assistant|>"
899
- start = prompt_tokens
900
- promptB = promptA = '%s%s' % (preprompt, start)
901
- PreInstruct = ""
902
- PreInput = None
903
- PreResponse = answer_tokens
904
- pend = "<|prefix_end|>"
905
- eos = "</s>"
906
- terminate_response = [start, PreResponse, pend, eos]
907
- chat_sep = eos
908
- elif prompt_type in [12, "12", "wizard_lm"]:
909
- # https://github.com/ehartford/WizardLM/blob/main/src/train_freeform.py
910
- preprompt = ''
911
- start = ''
912
- promptB = promptA = '%s%s' % (preprompt, start)
913
- PreInstruct = ""
914
- PreInput = None
915
- PreResponse = "\n\n### Response"
916
- eos = "</s>"
917
- terminate_response = [PreResponse, eos]
918
- chat_sep = eos
919
- else:
920
- raise RuntimeError("No such prompt_type=%s" % prompt_type)
921
-
922
- return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response, chat_sep
923
-
924
-
925
- def generate_prompt(data_point, prompt_type, chat, reduced):
926
- context = data_point.get('context')
927
- if context is None:
928
- context = ''
929
- instruction = data_point.get('instruction')
930
- input = data_point.get('input')
931
- output = data_point.get('output')
932
- prompt_type = data_point.get('prompt_type', prompt_type)
933
- assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
934
- promptA, promptB, PreInstruct, PreInput, PreResponse, \
935
- terminate_response, chat_sep = get_prompt(prompt_type, chat, context, reduced)
936
-
937
- prompt = context if not reduced else ''
938
-
939
- if input and promptA:
940
- prompt += f"""{promptA}"""
941
- elif promptB:
942
- prompt += f"""{promptB}"""
943
-
944
- if instruction and PreInstruct is not None and input and PreInput is not None:
945
- prompt += f"""{PreInstruct}{instruction}{PreInput}{input}"""
946
- prompt = inject_newline(prompt_type, prompt)
947
- elif instruction and input and PreInstruct is None and PreInput is not None:
948
- prompt += f"""{PreInput}{instruction}
949
- {input}"""
950
- prompt = inject_newline(prompt_type, prompt)
951
- elif input and instruction and PreInput is None and PreInstruct is not None:
952
- prompt += f"""{PreInstruct}{instruction}
953
- {input}"""
954
- prompt = inject_newline(prompt_type, prompt)
955
- elif instruction and PreInstruct is not None:
956
- prompt += f"""{PreInstruct}{instruction}"""
957
- prompt = inject_newline(prompt_type, prompt)
958
- elif input and PreInput is not None:
959
- prompt += f"""{PreInput}{input}"""
960
- prompt = inject_newline(prompt_type, prompt)
961
- elif input and instruction and PreInput is not None:
962
- prompt += f"""{PreInput}{instruction}{input}"""
963
- prompt = inject_newline(prompt_type, prompt)
964
- elif input and instruction and PreInstruct is not None:
965
- prompt += f"""{PreInstruct}{instruction}{input}"""
966
- prompt = inject_newline(prompt_type, prompt)
967
- elif input and instruction:
968
- # i.e. for simple_instruct
969
- prompt += f"""{instruction}: {input}"""
970
- prompt = inject_newline(prompt_type, prompt)
971
- elif input:
972
- prompt += f"""{input}"""
973
- prompt = inject_newline(prompt_type, prompt)
974
- elif instruction:
975
- prompt += f"""{instruction}"""
976
- prompt = inject_newline(prompt_type, prompt)
977
-
978
- if PreResponse is not None:
979
- prompt += f"""{PreResponse}"""
980
- pre_response = PreResponse # Don't use strip
981
- else:
982
- pre_response = ''
983
-
984
- if output:
985
- prompt += f"""{output}"""
986
-
987
- return prompt, pre_response, terminate_response, chat_sep
988
-
989
-
990
- def inject_newline(prompt_type, prompt):
991
- if prompt_type not in [-1, '-1', 'plain', 'simple_instruct']:
992
- # only add new line if structured prompt, while 'plain' is just generation of next tokens from input
993
- prompt += '\n'
994
- return prompt
995
-
996
-
997
- example_data_point0 = dict(instruction="Summarize",
998
- input="Ducks eat seeds by the lake, then swim in the lake where fish eat small animals.",
999
- output="Ducks eat and swim at the lake.")
1000
-
1001
- example_data_point1 = dict(instruction="Who is smarter, Einstein or Newton?",
1002
- output="Einstein.")
1003
-
1004
- example_data_point2 = dict(input="Who is smarter, Einstein or Newton?",
1005
- output="Einstein.")
1006
-
1007
- example_data_points = [example_data_point0, example_data_point1, example_data_point2]
1008
-
1009
-
1010
- def test_train_prompt(prompt_type='instruct', data_point=0):
1011
- example_data_point = example_data_points[data_point]
1012
- return generate_prompt(example_data_point, prompt_type, False, False)
1013
-
1014
-
1015
  def test_debug():
1016
  fire.Fire(train)
1017
 
 
1
  import os
2
  import sys
 
3
  from functools import partial
4
  from typing import List, Union
 
5
  import fire
6
  import numpy as np
7
+
8
+ from loaders import get_loaders, get_tokenizer
9
+ from prompter import generate_prompt, prompt_types
10
  from utils import get_githash, copy_code
11
  import torch
12
 
 
18
  print(*args, **kwargs)
19
 
20
 
21
  # supported by huggingface evaluate
22
  supported_metrics = ['bleu', 'rouge', 'sacrebleu', 'meteor']
23
 
 
278
  if os.path.exists(checkpoint_name):
279
  log(f"Restarting from {checkpoint_name}")
280
  adapters_weights = torch.load(checkpoint_name)
281
+ set_peft_model_state_dict(model, adapters_weights)
282
  else:
283
  log(f"Checkpoint {checkpoint_name} not found")
284
 
 
581
  log("\n If there's a warning about missing keys above, please disregard :)")
582
 
583
 
584
  def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=False):
585
  # there's probably a way to do this with the tokenizer settings
586
  # but again, gotta move fast
 
638
  return tokenized_full_prompt
639
 
640
 
641
  def test_debug():
642
  fire.Fire(train)
643
 
generate.py CHANGED
@@ -1,5 +1,9 @@
 
1
  import functools
 
 
2
  import queue
 
3
  import sys
4
  import os
5
  import time
@@ -9,7 +13,12 @@ from datetime import datetime
9
  import filelock
10
  import psutil
11
 
12
- from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash
 
 
 
 
 
13
 
14
  SEED = 1236
15
  set_seed(SEED)
@@ -25,13 +34,16 @@ from peft import PeftModel
25
  from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
26
  from accelerate import init_empty_weights, infer_auto_device_map
27
 
28
- from prompter import Prompter
29
-
30
- from finetune import get_loaders, example_data_points, generate_prompt, inv_prompt_type_to_model_lower
31
  from stopping import get_stopping
32
 
33
  eval_extra_columns = ['prompt', 'response', 'score']
34
 
 
 
 
 
 
35
 
36
  def main(
37
  load_8bit: bool = False,
@@ -63,6 +75,7 @@ def main(
63
  resume_download: bool = True,
64
  use_auth_token: Union[str, bool] = False,
65
  trust_remote_code: Union[str, bool] = True,
 
66
 
67
  src_lang: str = "English",
68
  tgt_lang: str = "Russian",
@@ -70,7 +83,6 @@ def main(
70
  gradio: bool = True,
71
  gradio_avoid_processing_markdown: bool = False,
72
  chat: bool = True,
73
- chat_history: int = 4096,
74
  chat_context: bool = False,
75
  stream_output: bool = True,
76
  show_examples: bool = None,
@@ -98,6 +110,30 @@ def main(
98
  eval_sharegpt_prompts_only: int = 0,
99
  eval_sharegpt_prompts_only_seed: int = 1234,
100
  eval_sharegpt_as_output: bool = False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  ):
102
  """
103
 
@@ -127,12 +163,12 @@ def main(
127
  :param resume_download: whether to resume downloads from HF for models
128
  :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
129
  :param trust_remote_code: whether to use trust any code needed for HF model
 
130
  :param src_lang: source languages to include if doing translation (None = all)
131
  :param tgt_lang: target languages to include if doing translation (None = all)
132
  :param gradio: whether to enable gradio, or to enable benchmark mode
133
  :param gradio_avoid_processing_markdown:
134
  :param chat: whether to enable chat mode with chat history
135
- :param chat_history: maximum character length of chat context/history
136
  :param chat_context: whether to use extra helpful context if human_bot
137
  :param stream_output: whether to stream output from generate
138
  :param show_examples: whether to show clickable examples in gradio
@@ -157,6 +193,41 @@ def main(
157
  :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
158
  :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
159
  :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  :return:
161
  """
162
  is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
@@ -170,8 +241,20 @@ def main(
170
 
171
  # allow set token directly
172
  use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  if is_public:
 
175
  input_lines = 1 # ensure set, for ease of use
176
  temperature = 0.2 if temperature is None else temperature
177
  top_p = 0.85 if top_p is None else top_p
@@ -211,7 +294,7 @@ def main(
211
  torch.backends.cudnn.benchmark = True
212
  torch.backends.cudnn.enabled = False
213
  torch.set_default_dtype(torch.float32)
214
- if psutil.virtual_memory().available < 94*1024**3:
215
  # 12B uses ~94GB
216
  # 6.9B uses ~47GB
217
  base_model = 'h2oai/h2ogpt-oig-oasst1-512-6.9b' if not base_model else base_model
@@ -223,16 +306,22 @@ def main(
223
  stream_output = False
224
  # else prompt removal can mess up output
225
  chat = False
 
 
 
 
 
 
226
 
227
  placeholder_instruction, placeholder_input, \
228
- stream_output, show_examples, \
229
- prompt_type, temperature, top_p, top_k, num_beams, \
230
- max_new_tokens, min_new_tokens, early_stopping, max_time, \
231
- repetition_penalty, num_return_sequences, \
232
- do_sample, \
233
- src_lang, tgt_lang, \
234
- examples, \
235
- task_info = \
236
  get_generate_params(model_lower, chat,
237
  stream_output, show_examples,
238
  prompt_type, temperature, top_p, top_k, num_beams,
@@ -246,6 +335,38 @@ def main(
246
  print(f"Generating model with params:\n{locals_print}", flush=True)
247
  print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()), flush=True)
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  if not gradio:
250
  if eval_sharegpt_prompts_only > 0:
251
  # override default examples with shareGPT ones for human-level eval purposes only
@@ -309,11 +430,9 @@ def main(
309
  if not eval_sharegpt_as_output:
310
  model, tokenizer, device = get_model(**locals())
311
  model_state = [model, tokenizer, device, base_model]
312
- fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir, is_low_mem=is_low_mem,
313
- raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
314
- chat_context=chat_context,
315
- concurrency_count=concurrency_count,
316
- lora_weights=lora_weights)
317
  else:
318
  assert eval_sharegpt_prompts_only > 0
319
 
@@ -325,8 +444,6 @@ def main(
325
  t0 = time.time()
326
  score_dump = []
327
 
328
- import matplotlib.pyplot as plt
329
-
330
  for exi, ex in enumerate(examples):
331
  instruction = ex[eval_func_param_names.index('instruction_nochat')]
332
  iinput = ex[eval_func_param_names.index('iinput_nochat')]
@@ -363,7 +480,8 @@ def main(
363
  try:
364
  score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
365
  except torch.cuda.OutOfMemoryError as e:
366
- print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
 
367
  traceback.print_exc()
368
  score = 0.0
369
  clear_torch_cache()
@@ -419,22 +537,23 @@ def main(
419
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
420
  score_model_state0 = [smodel, stokenizer, sdevice, score_model]
421
 
422
- go_gradio(**locals())
423
-
424
-
425
- def get_device():
426
- if torch.cuda.is_available():
427
- device = "cuda"
428
- else:
429
- device = "cpu"
430
 
431
- return device
432
 
433
 
434
  def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
435
  gpu_id=0,
436
  use_auth_token=False,
437
  trust_remote_code=True,
 
438
  triton_attn=False,
439
  long_sequence=True,
440
  ):
@@ -448,6 +567,7 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
448
  :param gpu_id:
449
  :param use_auth_token:
450
  :param trust_remote_code:
 
451
  :param triton_attn:
452
  :param long_sequence:
453
  :return:
@@ -455,7 +575,8 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
455
  with init_empty_weights():
456
  from transformers import AutoConfig
457
  config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
458
- trust_remote_code=trust_remote_code)
 
459
  if triton_attn and 'mpt-' in base_model.lower():
460
  config.attn_config['attn_impl'] = 'triton'
461
  if long_sequence:
@@ -485,7 +606,6 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
485
  dtype=torch.float16 if load_half else torch.float32,
486
  )
487
  device_map.update(device_map_model)
488
- print('device_map: %s' % device_map, flush=True)
489
  else:
490
  device_map = "auto"
491
 
@@ -504,6 +624,7 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
504
  else:
505
  device_map = {'': 'cpu'}
506
  model_kwargs['load_in_8bit'] = False
 
507
 
508
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
509
  model_kwargs['device_map'] = device_map
@@ -537,6 +658,7 @@ def get_model(
537
  resume_download: bool = True,
538
  use_auth_token: Union[str, bool] = False,
539
  trust_remote_code: bool = True,
 
540
  compile: bool = True,
541
  **kwargs,
542
  ):
@@ -556,11 +678,17 @@ def get_model(
556
  :param resume_download: resume downloads from HF
557
  :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
558
  :param trust_remote_code: trust code needed by model
 
559
  :param compile: whether to compile torch model
560
  :param kwargs:
561
  :return:
562
  """
563
  print("Get %s model" % base_model, flush=True)
 
 
 
 
 
564
  if lora_weights is not None and lora_weights.strip():
565
  print("Get %s lora weights" % lora_weights, flush=True)
566
  device = get_device()
@@ -575,7 +703,8 @@ def get_model(
575
 
576
  from transformers import AutoConfig
577
  config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
578
- trust_remote_code=trust_remote_code)
 
579
  llama_type_from_config = 'llama' in str(config).lower()
580
  llama_type_from_name = "llama" in base_model.lower()
581
  llama_type = llama_type_from_config or llama_type_from_name
@@ -593,6 +722,7 @@ def get_model(
593
  resume_download=resume_download,
594
  use_auth_token=use_auth_token,
595
  trust_remote_code=trust_remote_code,
 
596
  )
597
  else:
598
  tokenizer = tokenizer_loader
@@ -610,6 +740,7 @@ def get_model(
610
  resume_download=resume_download,
611
  use_auth_token=use_auth_token,
612
  trust_remote_code=trust_remote_code,
 
613
  )
614
  if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
615
  model_kwargs.update(dict(load_in_8bit=load_8bit,
@@ -630,6 +761,7 @@ def get_model(
630
  gpu_id=gpu_id,
631
  use_auth_token=use_auth_token,
632
  trust_remote_code=trust_remote_code,
 
633
  )
634
  else:
635
  if load_half and not load_8bit:
@@ -653,6 +785,7 @@ def get_model(
653
  resume_download=resume_download,
654
  use_auth_token=use_auth_token,
655
  trust_remote_code=trust_remote_code,
 
656
  device_map={"": 0} if device == 'cuda' else {"": 'cpu'}, # seems to be required
657
  )
658
  else:
@@ -669,6 +802,7 @@ def get_model(
669
  resume_download=resume_download,
670
  use_auth_token=use_auth_token,
671
  trust_remote_code=trust_remote_code,
 
672
  device_map="auto",
673
  )
674
  if load_half:
@@ -729,11 +863,13 @@ eval_func_param_names = ['instruction',
729
  'chat',
730
  'instruction_nochat',
731
  'iinput_nochat',
 
732
  ]
733
 
734
 
735
  def evaluate(
736
  model_state,
 
737
  # START NOTE: Examples must have same order of parameters
738
  instruction,
739
  iinput,
@@ -754,6 +890,7 @@ def evaluate(
754
  chat,
755
  instruction_nochat,
756
  iinput_nochat,
 
757
  # END NOTE: Examples must have same order of parameters
758
  src_lang=None,
759
  tgt_lang=None,
@@ -766,12 +903,34 @@ def evaluate(
766
  raise_generate_gpu_exceptions=None,
767
  chat_context=None,
768
  lora_weights=None,
 
 
 
 
 
 
 
 
 
 
 
 
 
769
  ):
770
  # ensure passed these
771
  assert concurrency_count is not None
772
  assert is_low_mem is not None
773
  assert raise_generate_gpu_exceptions is not None
774
  assert chat_context is not None
 
 
 
 
 
 
 
 
 
775
 
776
  if debug:
777
  locals_dict = locals().copy()
@@ -817,10 +976,58 @@ def evaluate(
817
  # get hidden context if have one
818
  context = get_context(chat_context, prompt_type)
819
 
820
- data_point = dict(context=context, instruction=instruction, input=iinput)
821
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
 
822
  prompt = prompter.generate_prompt(data_point)
823
 
824
  if isinstance(tokenizer, str):
825
  # pipeline
826
  if tokenizer == "summarization":
@@ -838,18 +1045,14 @@ def evaluate(
838
  # override, ignore user change
839
  num_return_sequences = 1
840
  stopping_criteria = get_stopping(prompt_type, tokenizer, device)
841
- # help to avoid errors like:
842
- # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
843
- # RuntimeError: expected scalar type Half but found Float
844
- # with - 256
845
- max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
846
- cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
847
- output_smallest = 30 * 4
848
- prompt = prompt[-cutoff_len - output_smallest:]
849
  inputs = tokenizer(prompt,
850
  return_tensors="pt",
851
  truncation=True,
852
  max_length=max_length_tokenize)
 
 
853
  if debug and len(inputs["input_ids"]) > 0:
854
  print('input_ids length', len(inputs["input_ids"][0]), flush=True)
855
  input_ids = inputs["input_ids"].to(device)
@@ -891,7 +1094,7 @@ def evaluate(
891
  **decoder_kwargs
892
  )
893
  decoder_raw_kwargs = dict(skip_special_tokens=False,
894
- clean_up_tokenization_spaces=True)
895
 
896
  decoder_raw = functools.partial(tokenizer.decode,
897
  **decoder_raw_kwargs
@@ -904,7 +1107,7 @@ def evaluate(
904
  # else hit bitsandbytes lack of thread safety:
905
  # https://github.com/h2oai/h2ogpt/issues/104
906
  # but only makes sense if concurrency_count == 1
907
- context_class = NullContext #if concurrency_count > 1 else filelock.FileLock
908
  print('Pre-Generate: %s' % str(datetime.now()), flush=True)
909
  decoded_output = None
910
  with context_class("generate.lock"):
@@ -923,7 +1126,9 @@ def evaluate(
923
  inputs_decoded = prompt = inputs_decoded_raw
924
  decoder = decoder_raw
925
  decoder_kwargs = decoder_raw_kwargs
926
- elif inputs_decoded_raw.replace("<unk> ", "").replace("<unk>", "").replace('\n', ' ').replace(' ', '') == prompt.replace('\n', ' ').replace(' ', ''):
 
 
927
  inputs_decoded = prompt = inputs_decoded_raw
928
  decoder = decoder_raw
929
  decoder_kwargs = decoder_raw_kwargs
@@ -931,13 +1136,15 @@ def evaluate(
931
  print("WARNING: Special characters in prompt", flush=True)
932
  if stream_output:
933
  skip_prompt = False
934
- streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False, **decoder_kwargs)
 
935
  gen_kwargs.update(dict(streamer=streamer))
936
- target_func = generate_with_exceptions
937
- target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
938
- raise_generate_gpu_exceptions, **gen_kwargs)
 
939
  bucket = queue.Queue()
940
- thread = EThread(target=target, kwargs=dict(streamer=streamer), bucket=bucket)
941
  thread.start()
942
  outputs = ""
943
  try:
@@ -969,7 +1176,30 @@ def evaluate(
969
  decoded_output = prompt + outputs[0]
970
  if save_dir and decoded_output:
971
  save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
972
- print('Post-Generate: %s decoded_output: %s' % (str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
 
973
 
974
 
975
  class H2OTextIteratorStreamer(TextIteratorStreamer):
@@ -977,6 +1207,7 @@ class H2OTextIteratorStreamer(TextIteratorStreamer):
977
  normally, timeout required for now to handle exceptions, else get()
978
  but with H2O version of TextIteratorStreamer, loop over block to handle
979
  """
 
980
  def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None,
981
  block=True, **decode_kwargs):
982
  super().__init__(tokenizer, skip_prompt, **decode_kwargs)
@@ -1003,7 +1234,7 @@ class H2OTextIteratorStreamer(TextIteratorStreamer):
1003
  print("hit stop", flush=True)
1004
  # could raise or break, maybe best to raise and make parent see if any exception in thread
1005
  raise StopIteration()
1006
- #break
1007
  value = self.text_queue.get(block=self.block, timeout=self.timeout)
1008
  break
1009
  except queue.Empty:
@@ -1014,15 +1245,16 @@ class H2OTextIteratorStreamer(TextIteratorStreamer):
1014
  return value
1015
 
1016
 
1017
- def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_exceptions, **kwargs):
1018
  try:
1019
- func(**kwargs)
1020
  except torch.cuda.OutOfMemoryError as e:
1021
  print("GPU OOM 2: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1022
  flush=True)
1023
- if kwargs['input_ids'] is not None:
1024
- kwargs['input_ids'].cpu()
1025
- kwargs['input_ids'] = None
 
1026
  traceback.print_exc()
1027
  clear_torch_cache()
1028
  return
@@ -1214,7 +1446,7 @@ y = np.random.randint(0, 1, 100)
1214
 
1215
  # move to correct position
1216
  for example in examples:
1217
- example += [chat, '', '']
1218
  # adjust examples if non-chat mode
1219
  if not chat:
1220
  example[eval_func_param_names.index('instruction_nochat')] = example[
@@ -1223,16 +1455,18 @@ y = np.random.randint(0, 1, 100)
1223
 
1224
  example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1225
  example[eval_func_param_names.index('iinput')] = ''
 
 
1226
 
1227
  return placeholder_instruction, placeholder_input, \
1228
- stream_output, show_examples, \
1229
- prompt_type, temperature, top_p, top_k, num_beams, \
1230
- max_new_tokens, min_new_tokens, early_stopping, max_time, \
1231
- repetition_penalty, num_return_sequences, \
1232
- do_sample, \
1233
- src_lang, tgt_lang, \
1234
- examples, \
1235
- task_info
1236
 
1237
 
1238
  def languages_covered():
@@ -1252,12 +1486,6 @@ def get_context(chat_context, prompt_type):
1252
  return context0
1253
 
1254
 
1255
- def test_test_prompt(prompt_type='instruct', data_point=0):
1256
- example_data_point = example_data_points[data_point]
1257
- example_data_point.pop('output', None)
1258
- return generate_prompt(example_data_point, prompt_type, False, False)
1259
-
1260
-
1261
  def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
1262
  question = question[-cutoff_len:]
1263
  answer = answer[-cutoff_len:]
@@ -1321,39 +1549,3 @@ if __name__ == "__main__":
1321
  python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
1322
  """
1323
  fire.Fire(main)
1324
-
1325
-
1326
- import pytest
1327
-
1328
- @pytest.mark.parametrize(
1329
- "base_model",
1330
- [
1331
- "h2oai/h2ogpt-oig-oasst1-512-6.9b",
1332
- "h2oai/h2ogpt-oig-oasst1-512-12b",
1333
- "h2oai/h2ogpt-oig-oasst1-512-20b",
1334
- "h2oai/h2ogpt-oasst1-512-12b",
1335
- "h2oai/h2ogpt-oasst1-512-20b",
1336
- "h2oai/h2ogpt-gm-oasst1-en-1024-20b",
1337
- "databricks/dolly-v2-12b",
1338
- "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
1339
- "ehartford/WizardLM-7B-Uncensored",
1340
- "ehartford/WizardLM-13B-Uncensored",
1341
- "AlekseyKorshuk/vicuna-7b",
1342
- "TheBloke/stable-vicuna-13B-HF",
1343
- "decapoda-research/llama-7b-hf",
1344
- "decapoda-research/llama-13b-hf",
1345
- "decapoda-research/llama-30b-hf",
1346
- "junelee/wizard-vicuna-13b",
1347
- ]
1348
- )
1349
- def test_score_eval(base_model):
1350
- main(
1351
- base_model=base_model,
1352
- chat=False,
1353
- stream_output=False,
1354
- gradio=False,
1355
- eval_sharegpt_prompts_only=500,
1356
- eval_sharegpt_as_output=False,
1357
- num_beams=2,
1358
- infer_devices=False,
1359
- )
 
1
+ import ast
2
  import functools
3
+ import glob
4
+ import inspect
5
  import queue
6
+ import shutil
7
  import sys
8
  import os
9
  import time
 
13
  import filelock
14
  import psutil
15
 
16
+ from loaders import get_loaders
17
+ from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash, \
18
+ import_matplotlib, get_device, makedirs
19
+
20
+ import_matplotlib()
21
+ from matplotlib import pyplot as plt
22
 
23
  SEED = 1236
24
  set_seed(SEED)
 
34
  from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
35
  from accelerate import init_empty_weights, infer_auto_device_map
36
 
37
+ from prompter import Prompter, inv_prompt_type_to_model_lower
 
 
38
  from stopping import get_stopping
39
 
40
  eval_extra_columns = ['prompt', 'response', 'score']
41
 
42
+ langchain_modes = ['Disabled', 'ChatLLM', 'LLM', 'All', 'wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT',
43
+ 'DriverlessAI docs']
44
+
45
+ scratch_base_dir = '/tmp/'
46
+
47
 
48
  def main(
49
  load_8bit: bool = False,
 
75
  resume_download: bool = True,
76
  use_auth_token: Union[str, bool] = False,
77
  trust_remote_code: Union[str, bool] = True,
78
+ offload_folder: str = "offline_folder",
79
 
80
  src_lang: str = "English",
81
  tgt_lang: str = "Russian",
 
83
  gradio: bool = True,
84
  gradio_avoid_processing_markdown: bool = False,
85
  chat: bool = True,
 
86
  chat_context: bool = False,
87
  stream_output: bool = True,
88
  show_examples: bool = None,
 
110
  eval_sharegpt_prompts_only: int = 0,
111
  eval_sharegpt_prompts_only_seed: int = 1234,
112
  eval_sharegpt_as_output: bool = False,
113
+
114
+ langchain_mode: str = 'Disabled',
115
+ visible_langchain_modes: list = ['UserData', 'MyData'],
116
+ user_path: str = None,
117
+ load_db_if_exists: bool = True,
118
+ keep_sources_in_context: bool = False,
119
+ db_type: str = 'chroma',
120
+ use_openai_embedding: bool = False,
121
+ use_openai_model: bool = False,
122
+ hf_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
123
+ allow_upload_to_user_data: bool = True,
124
+ allow_upload_to_my_data: bool = True,
125
+ enable_url_upload: bool = True,
126
+ enable_text_upload: bool = True,
127
+ enable_sources_list: bool = True,
128
+ chunk: bool = True,
129
+ chunk_size: int = 512,
130
+ k: int = 4,
131
+ n_jobs: int = -1,
132
+ enable_captions: bool = True,
133
+ captions_model: str = "Salesforce/blip-image-captioning-base",
134
+ pre_load_caption_model: bool = False,
135
+ caption_gpu: bool = True,
136
+ enable_ocr: bool = False,
137
  ):
138
  """
139
 
 
163
  :param resume_download: whether to resume downloads from HF for models
164
  :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
165
  :param trust_remote_code: whether to trust any code needed for HF model
166
+ :param offload_folder: path for spilling model onto disk
167
  :param src_lang: source languages to include if doing translation (None = all)
168
  :param tgt_lang: target languages to include if doing translation (None = all)
169
  :param gradio: whether to enable gradio, or to enable benchmark mode
170
  :param gradio_avoid_processing_markdown:
171
  :param chat: whether to enable chat mode with chat history
 
172
  :param chat_context: whether to use extra helpful context if human_bot
173
  :param stream_output: whether to stream output from generate
174
  :param show_examples: whether to show clickable examples in gradio
 
193
  :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
194
  :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
195
  :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
196
+ :param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
197
+ WARNING: wiki_full requires extra data processing via read_wiki_full.py and a powerful workstation to generate the db, unless already present.
198
+ :param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode
199
+ :param visible_langchain_modes: dbs to generate at launch to be ready for LLM
200
+ Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
201
+ But wiki_full is expensive and requires preparation
202
+ To allow scratch space only live in session, add 'MyData' to list
203
+ Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
204
+ FIXME: Avoid 'All' for now, not implemented
205
+ :param load_db_if_exists: Whether to load chroma db if exists or re-generate db
206
+ :param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
207
+ :param db_type: 'faiss' for in-memory or 'chroma' for persisted on disk
208
+ :param use_openai_embedding: Whether to use OpenAI embeddings for vector db
209
+ :param use_openai_model: Whether to use OpenAI model for use with vector db
210
+ :param hf_embedding_model: Which HF embedding model to use for vector db
211
+ :param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
212
+ :param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
213
+ :param enable_url_upload: Whether to allow upload from URL
214
+ :param enable_text_upload: Whether to allow upload of text
215
+ :param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
216
+ :param chunk: Whether to chunk data (True unless know data is already optimally chunked)
217
+ :param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to fit within the context length
218
+ :param k: number of chunks to give LLM
219
+ :param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
220
+ :param enable_captions: Whether to support captions using BLIP for image files as documents, then preloads that model
221
+ :param captions_model: Which model to use for captions.
222
+ captions_model: int = "Salesforce/blip-image-captioning-base", # continue capable
223
+ captions_model: str = "Salesforce/blip2-flan-t5-xl", # question/answer capable, 16GB state
224
+ captions_model: int = "Salesforce/blip2-flan-t5-xxl", # question/answer capable, 60GB state
225
+ Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
226
+ :param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
227
+ parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
228
+ Recommended if using larger caption model
229
+ :param caption_gpu: If captions are enabled, use GPU if available
230
+ :param enable_ocr: Whether to support OCR on images
231
  :return:
232
  """
233
  is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
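For orientation, a minimal sketch of how the LangChain-related options above might be combined; the document folder path is hypothetical, and all keyword names come from the main() signature above (the CLI flags map to the same names via fire):

    from generate import main
    main(base_model='h2oai/h2ogpt-oig-oasst1-512-6.9b',
         langchain_mode='UserData',                       # consume the shared UserData vector db
         user_path='./user_docs',                         # hypothetical folder to glob documents from
         visible_langchain_modes=['UserData', 'MyData'],
         chunk=True, chunk_size=512, k=4)                 # top-4 chunks of ~512 chars passed to the LLM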
 
241
 
242
  # allow set token directly
243
  use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
244
+ allow_upload_to_user_data = bool(os.environ.get("allow_upload_to_user_data", allow_upload_to_user_data))
245
+ allow_upload_to_my_data = bool(os.environ.get("allow_upload_to_my_data", allow_upload_to_my_data))
246
+ height = os.environ.get("HEIGHT", height)
247
+
248
+ # allow enabling langchain via ENV
249
+ # FIRST PLACE where LangChain referenced, but no imports related to it
250
+ langchain_mode = os.environ.get("LANGCHAIN_MODE", langchain_mode)
251
+ assert langchain_mode in langchain_modes, "Invalid langchain_mode %s" % langchain_mode
252
+ visible_langchain_modes = ast.literal_eval(os.environ.get("visible_langchain_modes", str(visible_langchain_modes)))
253
+ if langchain_mode not in visible_langchain_modes and langchain_mode in langchain_modes:
254
+ visible_langchain_modes += [langchain_mode]
255
 
256
  if is_public:
257
+ allow_upload_to_user_data = False
258
  input_lines = 1 # ensure set, for ease of use
259
  temperature = 0.2 if temperature is None else temperature
260
  top_p = 0.85 if top_p is None else top_p
 
294
  torch.backends.cudnn.benchmark = True
295
  torch.backends.cudnn.enabled = False
296
  torch.set_default_dtype(torch.float32)
297
+ if psutil.virtual_memory().available < 94 * 1024 ** 3:
298
  # 12B uses ~94GB
299
  # 6.9B uses ~47GB
300
  base_model = 'h2oai/h2ogpt-oig-oasst1-512-6.9b' if not base_model else base_model
 
306
  stream_output = False
307
  # else prompt removal can mess up output
308
  chat = False
309
+ # hard-coded defaults
310
+ first_para = False
311
+ text_limit = None
312
+
313
+ if offload_folder:
314
+ makedirs(offload_folder)
315
 
316
  placeholder_instruction, placeholder_input, \
317
+ stream_output, show_examples, \
318
+ prompt_type, temperature, top_p, top_k, num_beams, \
319
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
320
+ repetition_penalty, num_return_sequences, \
321
+ do_sample, \
322
+ src_lang, tgt_lang, \
323
+ examples, \
324
+ task_info = \
325
  get_generate_params(model_lower, chat,
326
  stream_output, show_examples,
327
  prompt_type, temperature, top_p, top_k, num_beams,
 
335
  print(f"Generating model with params:\n{locals_print}", flush=True)
336
  print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()), flush=True)
337
 
338
+ if langchain_mode != "Disabled":
339
+ # SECOND PLACE where LangChain referenced, but all imports are kept local so not required
340
+ from gpt_langchain import prep_langchain, get_some_dbs_from_hf
341
+ if is_hf:
342
+ get_some_dbs_from_hf()
343
+ dbs = {}
344
+ for langchain_mode1 in visible_langchain_modes:
345
+ if langchain_mode1 in ['MyData']:
346
+ # don't use what is on disk, remove it instead
347
+ for gpath1 in glob.glob(os.path.join(scratch_base_dir, 'db_dir_%s*' % langchain_mode1)):
348
+ if os.path.isdir(gpath1):
349
+ print("Removing old MyData: %s" % gpath1, flush=True)
350
+ shutil.rmtree(gpath1)
351
+ continue
352
+ if langchain_mode1 in ['All']:
353
+ # FIXME: All should be avoided until scans over each db, shouldn't be separate db
354
+ continue
355
+ persist_directory1 = 'db_dir_%s' % langchain_mode1 # single place, no special names for each case
356
+ db = prep_langchain(persist_directory1, load_db_if_exists, db_type, use_openai_embedding,
357
+ langchain_mode1, user_path,
358
+ hf_embedding_model,
359
+ kwargs_make_db=locals())
360
+ dbs[langchain_mode1] = db
361
+ # remove None dbs so can just rely upon k in dbs to check if we have a db
362
+ dbs = {k: v for k, v in dbs.items() if v is not None}
363
+ else:
364
+ dbs = {}
365
+ # import control
366
+ if os.environ.get("TEST_LANGCHAIN_IMPORT"):
367
+ assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
368
+ assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
369
+
370
  if not gradio:
371
  if eval_sharegpt_prompts_only > 0:
372
  # override default examples with shareGPT ones for human-level eval purposes only
 
430
  if not eval_sharegpt_as_output:
431
  model, tokenizer, device = get_model(**locals())
432
  model_state = [model, tokenizer, device, base_model]
433
+ kwargs_evaluate = {k: v for k, v in locals().items() if k in inputs_kwargs_list}
434
+ my_db_state = [None]
435
+ fun = partial(evaluate, model_state, my_db_state, **kwargs_evaluate)
 
 
436
  else:
437
  assert eval_sharegpt_prompts_only > 0
438
 
 
444
  t0 = time.time()
445
  score_dump = []
446
 
 
 
447
  for exi, ex in enumerate(examples):
448
  instruction = ex[eval_func_param_names.index('instruction_nochat')]
449
  iinput = ex[eval_func_param_names.index('iinput_nochat')]
 
480
  try:
481
  score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
482
  except torch.cuda.OutOfMemoryError as e:
483
+ print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
484
+ flush=True)
485
  traceback.print_exc()
486
  score = 0.0
487
  clear_torch_cache()
 
537
  smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
538
  score_model_state0 = [smodel, stokenizer, sdevice, score_model]
539
 
540
+ if enable_captions:
541
+ if pre_load_caption_model:
542
+ from image_captions import H2OImageCaptionLoader
543
+ caption_loader = H2OImageCaptionLoader(caption_gpu=caption_gpu).load_model()
544
+ else:
545
+ caption_loader = 'gpu' if caption_gpu else 'cpu'
546
+ else:
547
+ caption_loader = False
548
 
549
+ go_gradio(**locals())
550
 
551
 
552
  def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
553
  gpu_id=0,
554
  use_auth_token=False,
555
  trust_remote_code=True,
556
+ offload_folder=None,
557
  triton_attn=False,
558
  long_sequence=True,
559
  ):
 
567
  :param gpu_id:
568
  :param use_auth_token:
569
  :param trust_remote_code:
570
+ :param offload_folder:
571
  :param triton_attn:
572
  :param long_sequence:
573
  :return:
 
575
  with init_empty_weights():
576
  from transformers import AutoConfig
577
  config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
578
+ trust_remote_code=trust_remote_code,
579
+ offload_folder=offload_folder)
580
  if triton_attn and 'mpt-' in base_model.lower():
581
  config.attn_config['attn_impl'] = 'triton'
582
  if long_sequence:
 
606
  dtype=torch.float16 if load_half else torch.float32,
607
  )
608
  device_map.update(device_map_model)
 
609
  else:
610
  device_map = "auto"
611
 
 
624
  else:
625
  device_map = {'': 'cpu'}
626
  model_kwargs['load_in_8bit'] = False
627
+ print('device_map: %s' % device_map, flush=True)
628
 
629
  load_in_8bit = model_kwargs.get('load_in_8bit', False)
630
  model_kwargs['device_map'] = device_map
 
658
  resume_download: bool = True,
659
  use_auth_token: Union[str, bool] = False,
660
  trust_remote_code: bool = True,
661
+ offload_folder: str = None,
662
  compile: bool = True,
663
  **kwargs,
664
  ):
 
678
  :param resume_download: resume downloads from HF
679
  :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
680
  :param trust_remote_code: trust code needed by model
681
+ :param offload_folder: offload folder
682
  :param compile: whether to compile torch model
683
  :param kwargs:
684
  :return:
685
  """
686
  print("Get %s model" % base_model, flush=True)
687
+ if base_model in ['llama', 'gptj']:
688
+ from gpt4all_llm import get_model_tokenizer_gpt4all
689
+ model, tokenizer, device = get_model_tokenizer_gpt4all(base_model)
690
+ return model, tokenizer, device
691
+
692
  if lora_weights is not None and lora_weights.strip():
693
  print("Get %s lora weights" % lora_weights, flush=True)
694
  device = get_device()
 
703
 
704
  from transformers import AutoConfig
705
  config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
706
+ trust_remote_code=trust_remote_code,
707
+ offload_folder=offload_folder)
708
  llama_type_from_config = 'llama' in str(config).lower()
709
  llama_type_from_name = "llama" in base_model.lower()
710
  llama_type = llama_type_from_config or llama_type_from_name
 
722
  resume_download=resume_download,
723
  use_auth_token=use_auth_token,
724
  trust_remote_code=trust_remote_code,
725
+ offload_folder=offload_folder,
726
  )
727
  else:
728
  tokenizer = tokenizer_loader
 
740
  resume_download=resume_download,
741
  use_auth_token=use_auth_token,
742
  trust_remote_code=trust_remote_code,
743
+ offload_folder=offload_folder,
744
  )
745
  if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
746
  model_kwargs.update(dict(load_in_8bit=load_8bit,
 
761
  gpu_id=gpu_id,
762
  use_auth_token=use_auth_token,
763
  trust_remote_code=trust_remote_code,
764
+ offload_folder=offload_folder,
765
  )
766
  else:
767
  if load_half and not load_8bit:
 
785
  resume_download=resume_download,
786
  use_auth_token=use_auth_token,
787
  trust_remote_code=trust_remote_code,
788
+ offload_folder=offload_folder,
789
  device_map={"": 0} if device == 'cuda' else {"": 'cpu'}, # seems to be required
790
  )
791
  else:
 
802
  resume_download=resume_download,
803
  use_auth_token=use_auth_token,
804
  trust_remote_code=trust_remote_code,
805
+ offload_folder=offload_folder,
806
  device_map="auto",
807
  )
808
  if load_half:
 
863
  'chat',
864
  'instruction_nochat',
865
  'iinput_nochat',
866
+ 'langchain_mode',
867
  ]
868
 
869
 
870
  def evaluate(
871
  model_state,
872
+ my_db_state,
873
  # START NOTE: Examples must have same order of parameters
874
  instruction,
875
  iinput,
 
890
  chat,
891
  instruction_nochat,
892
  iinput_nochat,
893
+ langchain_mode,
894
  # END NOTE: Examples must have same order of parameters
895
  src_lang=None,
896
  tgt_lang=None,
 
903
  raise_generate_gpu_exceptions=None,
904
  chat_context=None,
905
  lora_weights=None,
906
+ load_db_if_exists=True,
907
+ dbs=None,
908
+ user_path=None,
909
+ use_openai_embedding=None,
910
+ use_openai_model=None,
911
+ hf_embedding_model=None,
912
+ chunk=None,
913
+ chunk_size=None,
914
+ db_type=None,
915
+ k=None,
916
+ n_jobs=None,
917
+ first_para=None,
918
+ text_limit=None,
919
  ):
920
  # ensure passed these
921
  assert concurrency_count is not None
922
  assert is_low_mem is not None
923
  assert raise_generate_gpu_exceptions is not None
924
  assert chat_context is not None
925
+ assert use_openai_embedding is not None
926
+ assert use_openai_model is not None
927
+ assert hf_embedding_model is not None
928
+ assert chunk is not None
929
+ assert chunk_size is not None
930
+ assert db_type is not None
931
+ assert k is not None
932
+ assert n_jobs is not None
933
+ assert first_para is not None
934
 
935
  if debug:
936
  locals_dict = locals().copy()
 
976
  # get hidden context if have one
977
  context = get_context(chat_context, prompt_type)
978
 
 
979
  prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
980
+ data_point = dict(context=context, instruction=instruction, input=iinput)
981
  prompt = prompter.generate_prompt(data_point)
982
 
983
+ # THIRD PLACE where LangChain referenced, but imports only occur if enabled and have db to use
984
+ assert langchain_mode in langchain_modes, "Invalid langchain_mode %s" % langchain_mode
985
+ if langchain_mode in ['MyData'] and my_db_state is not None and len(my_db_state) > 0 and my_db_state[0] is not None:
986
+ db1 = my_db_state[0]
987
+ elif dbs is not None and langchain_mode in dbs:
988
+ db1 = dbs[langchain_mode]
989
+ else:
990
+ db1 = None
991
+ if langchain_mode not in [False, 'Disabled', 'ChatLLM', 'LLM'] and db1 is not None or base_model in ['llama', 'gptj']:
992
+ query = instruction if not iinput else "%s\n%s" % (instruction, iinput)
993
+ outr = ""
994
+ # use smaller cut_distanct for wiki_full since so many matches could be obtained, and often irrelevant unless close
995
+ from gpt_langchain import run_qa_db
996
+ for r in run_qa_db(query=query,
997
+ model_name=base_model, model=model, tokenizer=tokenizer,
998
+ stream_output=stream_output,
999
+ prompter=prompter,
1000
+ load_db_if_exists=load_db_if_exists,
1001
+ db=db1,
1002
+ user_path=user_path,
1003
+ max_new_tokens=max_new_tokens,
1004
+ cut_distanct=1.1 if langchain_mode in ['wiki_full'] else 1.64, # FIXME, too arbitrary
1005
+ use_openai_embedding=use_openai_embedding,
1006
+ use_openai_model=use_openai_model,
1007
+ hf_embedding_model=hf_embedding_model,
1008
+ first_para=first_para,
1009
+ text_limit=text_limit,
1010
+ chunk=chunk,
1011
+ chunk_size=chunk_size,
1012
+ langchain_mode=langchain_mode,
1013
+ db_type=db_type,
1014
+ k=k,
1015
+ temperature=temperature,
1016
+ repetition_penalty=repetition_penalty,
1017
+ top_k=top_k,
1018
+ top_p=top_p,
1019
+ prompt_type=prompt_type,
1020
+ n_jobs=n_jobs,
1021
+ ):
1022
+ outr = r # doesn't accumulate, new answer every yield, so only save that full answer
1023
+ yield r
1024
+ if save_dir:
1025
+ save_generate_output(output=outr, base_model=base_model, save_dir=save_dir)
1026
+ print('Post-Generate Langchain: %s decoded_output: %s' % (str(datetime.now()), len(outr) if outr else -1),
1027
+ flush=True)
1028
+ if outr:
1029
+ return
1030
+
1031
  if isinstance(tokenizer, str):
1032
  # pipeline
1033
  if tokenizer == "summarization":
 
1045
  # override, ignore user change
1046
  num_return_sequences = 1
1047
  stopping_criteria = get_stopping(prompt_type, tokenizer, device)
1048
+ _, _, max_length_tokenize, max_prompt_length = get_cutoffs(is_low_mem)
1049
+ prompt = prompt[-max_prompt_length:]
 
1050
  inputs = tokenizer(prompt,
1051
  return_tensors="pt",
1052
  truncation=True,
1053
  max_length=max_length_tokenize)
1054
+ if inputs['input_ids'].shape[1] >= max_length_tokenize - 1:
1055
+ print("Cutting off input: %s %s" % (inputs['input_ids'].shape[1], max_length_tokenize), flush=True)
1056
  if debug and len(inputs["input_ids"]) > 0:
1057
  print('input_ids length', len(inputs["input_ids"][0]), flush=True)
1058
  input_ids = inputs["input_ids"].to(device)
 
1094
  **decoder_kwargs
1095
  )
1096
  decoder_raw_kwargs = dict(skip_special_tokens=False,
1097
+ clean_up_tokenization_spaces=True)
1098
 
1099
  decoder_raw = functools.partial(tokenizer.decode,
1100
  **decoder_raw_kwargs
 
1107
  # else hit bitsandbytes lack of thread safety:
1108
  # https://github.com/h2oai/h2ogpt/issues/104
1109
  # but only makes sense if concurrency_count == 1
1110
+ context_class = NullContext # if concurrency_count > 1 else filelock.FileLock
1111
  print('Pre-Generate: %s' % str(datetime.now()), flush=True)
1112
  decoded_output = None
1113
  with context_class("generate.lock"):
 
1126
  inputs_decoded = prompt = inputs_decoded_raw
1127
  decoder = decoder_raw
1128
  decoder_kwargs = decoder_raw_kwargs
1129
+ elif inputs_decoded_raw.replace("<unk> ", "").replace("<unk>", "").replace('\n', ' ').replace(' ',
1130
+ '') == prompt.replace(
1131
+ '\n', ' ').replace(' ', ''):
1132
  inputs_decoded = prompt = inputs_decoded_raw
1133
  decoder = decoder_raw
1134
  decoder_kwargs = decoder_raw_kwargs
 
1136
  print("WARNING: Special characters in prompt", flush=True)
1137
  if stream_output:
1138
  skip_prompt = False
1139
+ streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False,
1140
+ **decoder_kwargs)
1141
  gen_kwargs.update(dict(streamer=streamer))
1142
+ target = wrapped_partial(generate_with_exceptions, model.generate,
1143
+ prompt=prompt, inputs_decoded=inputs_decoded,
1144
+ raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
1145
+ **gen_kwargs)
1146
  bucket = queue.Queue()
1147
+ thread = EThread(target=target, streamer=streamer, bucket=bucket)
1148
  thread.start()
1149
  outputs = ""
1150
  try:
 
1176
  decoded_output = prompt + outputs[0]
1177
  if save_dir and decoded_output:
1178
  save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
1179
+ print('Post-Generate: %s decoded_output: %s' % (
1180
+ str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
1181
+
1182
+
1183
+ inputs_list_names = list(inspect.signature(evaluate).parameters)
1184
+ state_names = ['model_state', 'my_db_state']
1185
+ inputs_kwargs_list = [x for x in inputs_list_names if x not in eval_func_param_names + state_names]
1186
+
1187
+
1188
+ def get_cutoffs(is_low_mem, for_context=False):
1189
+ # help to avoid errors like:
1190
+ # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
1191
+ # RuntimeError: expected scalar type Half but found Float
1192
+ # with - 256
1193
+ max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
1194
+ cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
1195
+ output_smallest = 30 * 4
1196
+ max_prompt_length = cutoff_len - output_smallest
1197
+
1198
+ if for_context:
1199
+ # then lower even more to avoid later chop, since just estimate tokens in context bot
1200
+ max_prompt_length = max(64, int(max_prompt_length * 0.8))
1201
+
1202
+ return cutoff_len, output_smallest, max_length_tokenize, max_prompt_length
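With is_low_mem=False, the character budgets implied by get_cutoffs work out as follows (using the rough 4-characters-per-token heuristic baked into the code):

    cutoff_len, output_smallest, max_length_tokenize, max_prompt_length = get_cutoffs(False)
    # max_length_tokenize = 2048 - 256 = 1792 tokens
    # cutoff_len          = 1792 * 4   = 7168 characters
    # output_smallest     = 30 * 4     = 120 characters
    # max_prompt_length   = 7168 - 120 = 7048 characters kept from the end of the prompt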
1203
 
1204
 
1205
  class H2OTextIteratorStreamer(TextIteratorStreamer):
 
1207
  normally, timeout required for now to handle exceptions, else get()
1208
  but with H2O version of TextIteratorStreamer, loop over block to handle
1209
  """
1210
+
1211
  def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None,
1212
  block=True, **decode_kwargs):
1213
  super().__init__(tokenizer, skip_prompt, **decode_kwargs)
 
1234
  print("hit stop", flush=True)
1235
  # could raise or break, maybe best to raise and make parent see if any exception in thread
1236
  raise StopIteration()
1237
+ # break
1238
  value = self.text_queue.get(block=self.block, timeout=self.timeout)
1239
  break
1240
  except queue.Empty:
 
1245
  return value
1246
 
1247
 
1248
+ def generate_with_exceptions(func, *args, prompt='', inputs_decoded='', raise_generate_gpu_exceptions=True, **kwargs):
1249
  try:
1250
+ func(*args, **kwargs)
1251
  except torch.cuda.OutOfMemoryError as e:
1252
  print("GPU OOM 2: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
1253
  flush=True)
1254
+ if 'input_ids' in kwargs:
1255
+ if kwargs['input_ids'] is not None:
1256
+ kwargs['input_ids'].cpu()
1257
+ kwargs['input_ids'] = None
1258
  traceback.print_exc()
1259
  clear_torch_cache()
1260
  return
 
1446
 
1447
  # move to correct position
1448
  for example in examples:
1449
+ example += [chat, '', '', 'Disabled']
1450
  # adjust examples if non-chat mode
1451
  if not chat:
1452
  example[eval_func_param_names.index('instruction_nochat')] = example[
 
1455
 
1456
  example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1457
  example[eval_func_param_names.index('iinput')] = ''
1458
+ assert len(example) == len(eval_func_param_names), "Wrong example: %s %s" % (
1459
+ len(example), len(eval_func_param_names))
1460
 
1461
  return placeholder_instruction, placeholder_input, \
1462
+ stream_output, show_examples, \
1463
+ prompt_type, temperature, top_p, top_k, num_beams, \
1464
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
1465
+ repetition_penalty, num_return_sequences, \
1466
+ do_sample, \
1467
+ src_lang, tgt_lang, \
1468
+ examples, \
1469
+ task_info
1470
 
1471
 
1472
  def languages_covered():
 
1486
  return context0
1487
 
1488
 
 
1489
  def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
1490
  question = question[-cutoff_len:]
1491
  answer = answer[-cutoff_len:]
 
1549
  python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
1550
  """
1551
  fire.Fire(main)
gpt4all_llm.py ADDED
@@ -0,0 +1,119 @@
 
1
+ import inspect
2
+ import os
3
+ from typing import Dict, Any, Optional, List
4
+ from langchain.callbacks.manager import CallbackManagerForLLMRun
5
+ from pydantic import root_validator
6
+ from langchain.llms import gpt4all
7
+ from dotenv import dotenv_values
8
+
9
+
10
+ class FakeTokenizer:
11
+
12
+ def encode(self, x, *args, **kwargs):
13
+ return dict(input_ids=[x])
14
+
15
+ def decode(self, x, *args, **kwargs):
16
+ return x
17
+
18
+ def __call__(self, x, *args, **kwargs):
19
+ return self.encode(x, *args, **kwargs)
20
+
21
+
22
+ def get_model_tokenizer_gpt4all(base_model, **kwargs):
23
+ # defaults (some of these are generation parameters, so need to be passed in at generation time)
24
+ model_kwargs = dict(n_ctx=kwargs.get('max_new_tokens', 256),
25
+ n_threads=os.cpu_count() // 2,
26
+ temp=kwargs.get('temperature', 0.2),
27
+ top_p=kwargs.get('top_p', 0.75),
28
+ top_k=kwargs.get('top_k', 40))
29
+ env_gpt4all_file = ".env_gpt4all"
30
+ model_kwargs.update(dotenv_values(env_gpt4all_file))
31
+
32
+ if base_model == "llama":
33
+ if 'model_path_llama' not in model_kwargs:
34
+ raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
35
+ model_path = model_kwargs.pop('model_path_llama')
36
+ from gpt4all import GPT4All as GPT4AllModel
37
+ elif base_model == "gptj":
38
+ if 'model_path_gptj' not in model_kwargs:
39
+ raise ValueError("No model_path_gptj in %s" % env_gpt4all_file)
40
+ model_path = model_kwargs.pop('model_path_gptj')
41
+ from gpt4all import GPT4All as GPT4AllModel
42
+ else:
43
+ raise ValueError("No such base_model %s" % base_model)
44
+ func_names = list(inspect.signature(GPT4AllModel).parameters)
45
+ model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
46
+ model = GPT4AllModel(model_path, **model_kwargs)
47
+ return model, FakeTokenizer(), 'cpu'
48
+
49
+
50
+ def get_llm_gpt4all(model_name, model=None,
51
+ max_new_tokens=256,
52
+ temperature=0.1,
53
+ repetition_penalty=1.0,
54
+ top_k=40,
55
+ top_p=0.7):
56
+ env_gpt4all_file = ".env_gpt4all"
57
+ model_kwargs = dotenv_values(env_gpt4all_file)
58
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
59
+ callbacks = [StreamingStdOutCallbackHandler()]
60
+ n_ctx = model_kwargs.pop('n_ctx', 1024)
61
+ default_params = {'context_erase': 0.5, 'n_batch': 1, 'n_ctx': n_ctx, 'n_predict': max_new_tokens,
62
+ 'repeat_last_n': 64 if repetition_penalty != 1.0 else 0, 'repeat_penalty': repetition_penalty,
63
+ 'temp': temperature, 'top_k': top_k, 'top_p': top_p}
64
+ if model_name == 'llama':
65
+ from langchain.llms import LlamaCpp
66
+ model_path = model_kwargs.pop('model_path_llama') if model is None else model
67
+ llm = LlamaCpp(model_path=model_path, n_ctx=n_ctx, callbacks=callbacks, verbose=False)
68
+ else:
69
+ model_path = model_kwargs.pop('model_path_gptj') if model is None else model
70
+ llm = H2OGPT4All(model=model_path, backend='gptj', callbacks=callbacks,
71
+ verbose=False, **default_params,
72
+ )
73
+ return llm
74
+
75
+
76
+ class H2OGPT4All(gpt4all.GPT4All):
77
+ model: Any
78
+ """Path to the pre-trained GPT4All model file."""
79
+
80
+ @root_validator()
81
+ def validate_environment(cls, values: Dict) -> Dict:
82
+ """Validate that the python package exists in the environment."""
83
+ try:
84
+ if isinstance(values["model"], str):
85
+ from gpt4all import GPT4All as GPT4AllModel
86
+
87
+ full_path = values["model"]
88
+ model_path, delimiter, model_name = full_path.rpartition("/")
89
+ model_path += delimiter
90
+
91
+ values["client"] = GPT4AllModel(
92
+ model_name=model_name,
93
+ model_path=model_path or None,
94
+ model_type=values["backend"],
95
+ allow_download=False,
96
+ )
97
+ else:
98
+ values["client"] = values["model"]
99
+ values["backend"] = values["client"].model.model_type
100
+
101
+ except ImportError:
102
+ raise ValueError(
103
+ "Could not import gpt4all python package. "
104
+ "Please install it with `pip install gpt4all`."
105
+ )
106
+ return values
107
+
108
+ def _call(
109
+ self,
110
+ prompt: str,
111
+ stop: Optional[List[str]] = None,
112
+ run_manager: Optional[CallbackManagerForLLMRun] = None,
113
+ ) -> str:
114
+ # Roughly 4 chars per token if natural language
115
+ prompt = prompt[-self.n_ctx * 4:]
116
+ verbose = False
117
+ if verbose:
118
+ print("_call prompt: %s" % prompt, flush=True)
119
+ return super()._call(prompt, stop=stop, run_manager=run_manager)
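For reference, a sketch of what the .env_gpt4all file read via dotenv_values above might contain; the key names (model_path_llama, model_path_gptj, n_ctx) are the ones the code looks up, while the file paths are illustrative only:

    # .env_gpt4all (illustrative paths, not shipped defaults)
    model_path_llama=./models/ggml-llama-7b-q4_0.bin
    model_path_gptj=./models/ggml-gpt4all-j-v1.3-groovy.bin
    n_ctx=1024

With such a file in place, CPU-only loading reduces to a call like:

    from gpt4all_llm import get_model_tokenizer_gpt4all
    model, tokenizer, device = get_model_tokenizer_gpt4all('gptj')  # device is always 'cpu' here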
gpt_langchain.py ADDED
@@ -0,0 +1,1076 @@
 
1
+ import glob
2
+ import inspect
3
+ import os
4
+ import pathlib
5
+ import pickle
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ import traceback
11
+ import uuid
12
+ import zipfile
13
+ from collections import defaultdict
14
+ from datetime import datetime
15
+ from functools import reduce
16
+ from operator import concat
17
+
18
+ from joblib import Parallel, delayed
19
+
20
+ from utils import wrapped_partial, EThread, import_matplotlib, sanitize_filename, makedirs, get_url, flatten_list, \
21
+ get_device
22
+
23
+ import_matplotlib()
24
+
25
+ import numpy as np
26
+ import pandas as pd
27
+ import requests
28
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
29
+ # , GCSDirectoryLoader, GCSFileLoader
30
+ # , OutlookMessageLoader # GPL3
31
+ # ImageCaptionLoader, # use our own wrapper
32
+ # ReadTheDocsLoader, # no special file, some path, so have to give as special option
33
+ from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader, PythonLoader, TomlLoader, \
34
+ UnstructuredURLLoader, UnstructuredHTMLLoader, UnstructuredWordDocumentLoader, UnstructuredMarkdownLoader, \
35
+ EverNoteLoader, UnstructuredEmailLoader, UnstructuredODTLoader, UnstructuredPowerPointLoader, \
36
+ UnstructuredEPubLoader, UnstructuredImageLoader, UnstructuredRTFLoader, ArxivLoader
37
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
38
+ from langchain.vectorstores import FAISS
39
+ from langchain.chains.question_answering import load_qa_chain
40
+ from langchain.docstore.document import Document
41
+ from langchain import PromptTemplate
42
+ from langchain.vectorstores import Chroma
43
+
44
+
45
+ def get_db(sources, use_openai_embedding=False, db_type='faiss', persist_directory="db_dir", langchain_mode='notset',
46
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
47
+ if not sources:
48
+ return None
49
+ # get embedding model
50
+ embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
51
+
52
+ # Create vector database
53
+ if db_type == 'faiss':
54
+ db = FAISS.from_documents(sources, embedding)
55
+ elif db_type == 'chroma':
56
+ collection_name = langchain_mode.replace(' ', '_')
57
+ os.makedirs(persist_directory, exist_ok=True)
58
+ db = Chroma.from_documents(documents=sources,
59
+ embedding=embedding,
60
+ persist_directory=persist_directory,
61
+ collection_name=collection_name,
62
+ anonymized_telemetry=False)
63
+ db.persist()
64
+ # FIXME: below just proves can load persistent dir, regenerates its embedding files, so a bit wasteful
65
+ if False:
66
+ db = Chroma(embedding_function=embedding,
67
+ persist_directory=persist_directory,
68
+ collection_name=collection_name)
69
+ else:
70
+ raise RuntimeError("No such db_type=%s" % db_type)
71
+
72
+ return db
73
+
74
+
75
+ def add_to_db(db, sources, db_type='faiss', avoid_dup=True):
76
+ if not sources:
77
+ return db
78
+ if db_type == 'faiss':
79
+ db.add_documents(sources)
80
+ elif db_type == 'chroma':
81
+ if avoid_dup:
82
+ collection = db.get()
83
+ metadata_sources = set([x['source'] for x in collection['metadatas']])
84
+ sources = [x for x in sources if x.metadata['source'] not in metadata_sources]
85
+ if len(sources) == 0:
86
+ return db
87
+ db.add_documents(documents=sources)
88
+ db.persist()
89
+ else:
90
+ raise RuntimeError("No such db_type=%s" % db_type)
91
+
92
+ return db
93
+
94
+
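A small sketch of how get_db and add_to_db above compose; the document content and paths are made up for illustration:

    from langchain.docstore.document import Document
    docs = [Document(page_content="h2oGPT is an open-source chatbot.",
                     metadata={"source": "note.txt"})]
    db = get_db(docs, use_openai_embedding=False, db_type='chroma',
                persist_directory='db_dir_UserData', langchain_mode='UserData')
    db = add_to_db(db, docs, db_type='chroma')  # same source already present, so skipped when avoid_dup=True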
95
+ def get_embedding(use_openai_embedding, hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
96
+ # Get embedding model
97
+ if use_openai_embedding:
98
+ assert os.getenv("OPENAI_API_KEY") is not None, "Set ENV OPENAI_API_KEY"
99
+ from langchain.embeddings import OpenAIEmbeddings
100
+ embedding = OpenAIEmbeddings()
101
+ else:
102
+ # to ensure can fork without deadlock
103
+ from langchain.embeddings import HuggingFaceEmbeddings
104
+
105
+ device, torch_dtype, context_class = get_device_dtype()
106
+ model_kwargs = dict(device=device)
107
+ embedding = HuggingFaceEmbeddings(model_name=hf_embedding_model, model_kwargs=model_kwargs)
108
+ return embedding
109
+
110
+
111
+ def get_answer_from_sources(chain, sources, question):
112
+ return chain(
113
+ {
114
+ "input_documents": sources,
115
+ "question": question,
116
+ },
117
+ return_only_outputs=True,
118
+ )["output_text"]
119
+
120
+
121
+ def get_llm(use_openai_model=False, model_name=None, model=None,
122
+ tokenizer=None, stream_output=False,
123
+ max_new_tokens=256,
124
+ temperature=0.1,
125
+ repetition_penalty=1.0,
126
+ top_k=40,
127
+ top_p=0.7,
128
+ prompt_type=None,
129
+ ):
130
+ if use_openai_model:
131
+ from langchain.llms import OpenAI
132
+ llm = OpenAI(temperature=0)
133
+ model_name = 'openai'
134
+ streamer = None
135
+ elif model_name in ['gptj', 'llama']:
136
+ from gpt4all_llm import get_llm_gpt4all
137
+ llm = get_llm_gpt4all(model_name, model=model, max_new_tokens=max_new_tokens,
138
+ temperature=temperature,
139
+ repetition_penalty=repetition_penalty,
140
+ top_k=top_k,
141
+ top_p=top_p,
142
+ )
143
+ streamer = None
144
+ prompt_type = 'plain'
145
+ else:
146
+ from transformers import AutoTokenizer, AutoModelForCausalLM
147
+
148
+ if model is None:
149
+ # only used if didn't pass model in
150
+ assert model_name is None
151
+ assert tokenizer is None
152
+ model_name = 'h2oai/h2ogpt-oasst1-512-12b'
153
+ # model_name = 'h2oai/h2ogpt-oig-oasst1-512-6.9b'
154
+ # model_name = 'h2oai/h2ogpt-oasst1-512-20b'
155
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
156
+ device, torch_dtype, context_class = get_device_dtype()
157
+
158
+ with context_class(device):
159
+ load_8bit = True
160
+ # FIXME: for now not to spread across hetero GPUs
161
+ # device_map={"": 0} if load_8bit and device == 'cuda' else "auto"
162
+ device_map = {"": 0} if device == 'cuda' else "auto"
163
+ model = AutoModelForCausalLM.from_pretrained(model_name,
164
+ device_map=device_map,
165
+ torch_dtype=torch_dtype,
166
+ load_in_8bit=load_8bit)
167
+
168
+ gen_kwargs = dict(max_new_tokens=max_new_tokens, return_full_text=True, early_stopping=False)
169
+ if stream_output:
170
+ skip_prompt = False
171
+ from generate import H2OTextIteratorStreamer
172
+ decoder_kwargs = {}
173
+ streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False, **decoder_kwargs)
174
+ gen_kwargs.update(dict(streamer=streamer))
175
+ else:
176
+ streamer = None
177
+
178
+ if 'h2ogpt' in model_name or prompt_type == 'human_bot':
179
+ from h2oai_pipeline import H2OTextGenerationPipeline
180
+ pipe = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer, **gen_kwargs)
181
+ # pipe.task = "text-generation"
182
+ # below makes it listen only to our prompt removal, not built in prompt removal that is less general and not specific for our model
183
+ pipe.task = "text2text-generation"
184
+ prompt_type = 'human_bot'
185
+ else:
186
+ # only for non-instruct tuned cases when ok with just normal next token prediction
187
+ from transformers import pipeline
188
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **gen_kwargs)
189
+
190
+ from langchain.llms import HuggingFacePipeline
191
+ llm = HuggingFacePipeline(pipeline=pipe)
192
+ return llm, model_name, streamer, prompt_type
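A sketch of the CPU-only path through get_llm, assuming a gpt4all model has been configured in .env_gpt4all as sketched in gpt4all_llm.py:

    llm, model_name, streamer, prompt_type = get_llm(model_name='gptj', max_new_tokens=128)
    print(llm("In one sentence, what is a vector database?"))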
193
+
194
+
195
+ def get_device_dtype():
196
+ # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
197
+ import torch
198
+ n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
199
+ device = 'cpu' if n_gpus == 0 else 'cuda'
200
+ # from utils import NullContext
201
+ # context_class = NullContext if n_gpus > 1 or n_gpus == 0 else context_class
202
+ context_class = torch.device
203
+ torch_dtype = torch.float16 if device == 'cuda' else torch.float32
204
+ return device, torch_dtype, context_class
205
+
206
+
207
+ def get_wiki_data(title, first_paragraph_only, text_limit=None, take_head=True):
208
+ """
209
+ Get wikipedia data from online
210
+ :param title:
211
+ :param first_paragraph_only:
212
+ :param text_limit:
213
+ :param take_head:
214
+ :return:
215
+ """
216
+ filename = 'wiki_%s_%s_%s_%s.data' % (first_paragraph_only, title, text_limit, take_head)
217
+ url = f"https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext=1&titles={title}"
218
+ if first_paragraph_only:
219
+ url += "&exintro=1"
220
+ import json
221
+ if not os.path.isfile(filename):
222
+ data = requests.get(url).json()
223
+ json.dump(data, open(filename, 'wt'))
224
+ else:
225
+ data = json.load(open(filename, "rt"))
226
+ page_content = list(data["query"]["pages"].values())[0]["extract"]
227
+ if take_head is not None and text_limit is not None:
228
+ page_content = page_content[:text_limit] if take_head else page_content[:-text_limit]
229
+ title_url = str(title).replace(' ', '_')
230
+ return Document(
231
+ page_content=page_content,
232
+ metadata={"source": f"https://en.wikipedia.org/wiki/{title_url}"},
233
+ )
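For illustration, fetching a single named page with the helper above (network access assumed; 'Unix' is one of the default wiki sources used below):

    doc = get_wiki_data("Unix", first_paragraph_only=True, text_limit=1000)
    print(doc.metadata["source"])  # -> https://en.wikipedia.org/wiki/Unix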
234
+
235
+
236
+ def get_wiki_sources(first_para=True, text_limit=None):
237
+ """
238
+ Get specific named sources from wikipedia
239
+ :param first_para:
240
+ :param text_limit:
241
+ :return:
242
+ """
243
+ default_wiki_sources = ['Unix', 'Microsoft_Windows', 'Linux']
244
+ wiki_sources = list(os.getenv('WIKI_SOURCES', default_wiki_sources))
245
+ return [get_wiki_data(x, first_para, text_limit=text_limit) for x in wiki_sources]
246
+
247
+
248
+ def get_github_docs(repo_owner, repo_name):
249
+ """
250
+ Access github from specific repo
251
+ :param repo_owner:
252
+ :param repo_name:
253
+ :return:
254
+ """
255
+ with tempfile.TemporaryDirectory() as d:
256
+ subprocess.check_call(
257
+ f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
258
+ cwd=d,
259
+ shell=True,
260
+ )
261
+ git_sha = (
262
+ subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
263
+ .decode("utf-8")
264
+ .strip()
265
+ )
266
+ repo_path = pathlib.Path(d)
267
+ markdown_files = list(repo_path.glob("*/*.md")) + list(
268
+ repo_path.glob("*/*.mdx")
269
+ )
270
+ for markdown_file in markdown_files:
271
+ with open(markdown_file, "r") as f:
272
+ relative_path = markdown_file.relative_to(repo_path)
273
+ github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
274
+ yield Document(page_content=f.read(), metadata={"source": github_url})
275
+
276
+
277
+ def get_dai_pickle(dest="."):
278
+ from huggingface_hub import hf_hub_download
279
+ # True for case when locally already logged in with correct token, so don't have to set key
280
+ token = os.getenv('HUGGINGFACE_API_TOKEN', True)
281
+ path_to_zip_file = hf_hub_download('h2oai/dai_docs', 'dai_docs.pickle', token=token, repo_type='dataset')
282
+ shutil.copy(path_to_zip_file, dest)
283
+
284
+
285
+ def get_dai_docs(from_hf=False, get_pickle=True):
286
+ """
287
+ Consume DAI documentation, or consume from public pickle
288
+ :param from_hf: get DAI docs from HF, then generate pickle for later use by LangChain
289
+ :param get_pickle: Avoid raw DAI docs, just get pickle directly from HF
290
+ :return:
291
+ """
292
+ import pickle
293
+
294
+ if get_pickle:
295
+ get_dai_pickle()
296
+
297
+ dai_store = 'dai_docs.pickle'
298
+ dst = "working_dir_docs"
299
+ if not os.path.isfile(dai_store):
300
+ from create_data import setup_dai_docs
301
+ dst = setup_dai_docs(dst=dst, from_hf=from_hf)
302
+
303
+ import glob
304
+ files = list(glob.glob(os.path.join(dst, '*rst'), recursive=True))
305
+
306
+ basedir = os.path.abspath(os.getcwd())
307
+ from create_data import rst_to_outputs
308
+ new_outputs = rst_to_outputs(files)
309
+ os.chdir(basedir)
310
+
311
+ pickle.dump(new_outputs, open(dai_store, 'wb'))
312
+ else:
313
+ new_outputs = pickle.load(open(dai_store, 'rb'))
314
+
315
+ sources = []
316
+ for line, file in new_outputs:
317
+ # gradio requires any linked file to be with app.py
318
+ sym_src = os.path.abspath(os.path.join(dst, file))
319
+ sym_dst = os.path.abspath(os.path.join(os.getcwd(), file))
320
+ if os.path.lexists(sym_dst):
321
+ os.remove(sym_dst)
322
+ os.symlink(sym_src, sym_dst)
323
+ itm = Document(page_content=line, metadata={"source": file})
324
+ # NOTE: yield has issues when going into db, loses metadata
325
+ # yield itm
326
+ sources.append(itm)
327
+ return sources
328
+
329
+
330
+ import distutils.spawn
331
+
332
+ have_tesseract = distutils.spawn.find_executable("tesseract")
333
+ have_libreoffice = distutils.spawn.find_executable("libreoffice")
334
+
335
+ import pkg_resources
336
+
337
+ try:
338
+ assert pkg_resources.get_distribution('arxiv') is not None
339
+ assert pkg_resources.get_distribution('pymupdf') is not None
340
+ have_arxiv = True
341
+ except (pkg_resources.DistributionNotFound, AssertionError):
342
+ have_arxiv = False
343
+
344
+ image_types = ["png", "jpg", "jpeg"]
345
+ non_image_types = ["pdf", "txt", "csv", "toml", "py", "rst", "rtf",
346
+ "md", "html",
347
+ "enex", "eml", "epub", "odt", "pptx", "ppt",
348
+ "zip", "urls",
349
+ ]
350
+ # "msg", GPL3
351
+
352
+ if have_libreoffice:
353
+ non_image_types.extend(["docx", "doc"])
354
+
355
+ file_types = non_image_types + image_types
356
+
357
+
358
+ def add_meta(docs1, file):
359
+ file_extension = pathlib.Path(file).suffix
360
+ if not isinstance(docs1, list):
361
+ docs1 = [docs1]
362
+ [x.metadata.update(dict(input_type=file_extension, date=str(datetime.now()))) for x in docs1]
363
+
364
+
365
+ def file_to_doc(file, base_path=None, verbose=False, fail_any_exception=False, chunk=True, chunk_size=512,
366
+ is_url=False, is_txt=False,
367
+ enable_captions=True,
368
+ captions_model=None,
369
+ enable_ocr=False, caption_loader=None,
370
+ headsize=50):
371
+ if file is None:
372
+ if fail_any_exception:
373
+ raise RuntimeError("Unexpected None file")
374
+ else:
375
+ return []
376
+ doc1 = [] # in case no support, or disabled support
377
+ if base_path is None and not is_txt and not is_url:
378
+ # then assume want to persist but don't care which path used
379
+ # can't be in base_path
380
+ dir_name = os.path.dirname(file)
381
+ base_name = os.path.basename(file)
382
+ # if from gradio, will have its own temp uuid too, but that's ok
383
+ base_name = sanitize_filename(base_name) + "_" + str(uuid.uuid4())[:10]
384
+ base_path = os.path.join(dir_name, base_name)
385
+ if is_url:
386
+ if file.lower().startswith('arxiv:'):
387
+ query = file.lower().split('arxiv:')
388
+ if len(query) == 2 and have_arxiv:
389
+ query = query[1]
390
+ docs1 = ArxivLoader(query=query, load_max_docs=20, load_all_available_meta=True).load()
391
+ # ensure string, sometimes None
392
+ [[x.metadata.update({k: str(v)}) for k, v in x.metadata.items()] for x in docs1]
393
+ query_url = f"https://arxiv.org/abs/{query}"
394
+ [x.metadata.update(
395
+ dict(source=x.metadata.get('entry_id', query_url), query=query_url,
396
+ input_type='arxiv', head=x.metadata.get('Title', ''), date=str(datetime.now()))) for x in
397
+ docs1]
398
+ else:
399
+ docs1 = []
400
+ else:
401
+ docs1 = UnstructuredURLLoader(urls=[file]).load()
402
+ [x.metadata.update(dict(input_type='url', date=str(datetime.now()))) for x in docs1]
403
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
404
+ elif is_txt:
405
+ base_path = "user_paste"
406
+ source_file = os.path.join(base_path, "_%s" % str(uuid.uuid4())[:10])
407
+ makedirs(os.path.dirname(source_file), exist_ok=True)
408
+ with open(source_file, "wt") as f:
409
+ f.write(file)
410
+ metadata = dict(source=source_file, date=str(datetime.now()), input_type='pasted txt')
411
+ doc1 = Document(page_content=file, metadata=metadata)
412
+ elif file.endswith('.html') or file.endswith('.mhtml'):
413
+ docs1 = UnstructuredHTMLLoader(file_path=file).load()
414
+ add_meta(docs1, file)
415
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
416
+ elif (file.endswith('.docx') or file.endswith('.doc')) and have_libreoffice:
417
+ docs1 = UnstructuredWordDocumentLoader(file_path=file).load()
418
+ add_meta(docs1, file)
419
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
420
+ elif file.endswith('.odt'):
421
+ docs1 = UnstructuredODTLoader(file_path=file).load()
422
+ add_meta(docs1, file)
423
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
424
+ elif file.endswith('pptx') or file.endswith('ppt'):
425
+ docs1 = UnstructuredPowerPointLoader(file_path=file).load()
426
+ add_meta(docs1, file)
427
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
428
+ elif file.endswith('.txt'):
429
+ # use UnstructuredFileLoader ?
430
+ doc1 = TextLoader(file, encoding="utf8", autodetect_encoding=True).load()
431
+ add_meta(doc1, file)
432
+ elif file.endswith('.rtf'):
433
+ docs1 = UnstructuredRTFLoader(file).load()
434
+ add_meta(docs1, file)
435
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
436
+ elif file.endswith('.md'):
437
+ docs1 = UnstructuredMarkdownLoader(file).load()
438
+ add_meta(docs1, file)
439
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
440
+ elif file.endswith('.enex'):
441
+ doc1 = EverNoteLoader(file).load()
442
+ add_meta(doc1, file)
443
+ elif file.endswith('.epub'):
444
+ docs1 = UnstructuredEPubLoader(file).load()
445
+ add_meta(docs1, file)
446
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
447
+ elif file.endswith('.jpeg') or file.endswith('.jpg') or file.endswith('.png'):
448
+ docs1 = []
449
+ if have_tesseract and enable_ocr:
450
+ # OCR, somewhat works, but not great
451
+ docs1.extend(UnstructuredImageLoader(file).load())
452
+ add_meta(docs1, file)
453
+ if enable_captions:
454
+ # BLIP
455
+ if caption_loader is not None and not isinstance(caption_loader, (str, bool)):
456
+ # assumes didn't fork into this process with joblib, else can deadlock
457
+ caption_loader.set_image_paths([file])
458
+ docs1c = caption_loader.load()
459
+ add_meta(docs1c, file)
460
+ [x.metadata.update(dict(head=x.page_content[:headsize].strip())) for x in docs1c]
461
+ docs1.extend(docs1c)
462
+ else:
463
+ from image_captions import H2OImageCaptionLoader
464
+ caption_loader = H2OImageCaptionLoader(caption_gpu=caption_loader == 'gpu',
465
+ blip_model=captions_model,
466
+ blip_processor=captions_model)
467
+ caption_loader.set_image_paths([file])
468
+ docs1c = caption_loader.load()
469
+ add_meta(docs1c, file)
470
+ [x.metadata.update(dict(head=x.page_content[:headsize].strip())) for x in docs1c]
471
+ docs1.extend(docs1c)
472
+ for doci in docs1:
473
+ doci.metadata['source'] = doci.metadata['image_path']
474
+ if docs1:
475
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
476
+ elif file.endswith('.msg'):
477
+ raise RuntimeError("Not supported, GPL3 license")
478
+ # docs1 = OutlookMessageLoader(file).load()
479
+ # docs1[0].metadata['source'] = file
480
+ elif file.endswith('.eml'):
481
+ try:
482
+ docs1 = UnstructuredEmailLoader(file).load()
483
+ add_meta(docs1, file)
484
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
485
+ except ValueError as e:
486
+ if 'text/html content not found in email' in str(e):
487
+ # e.g. plain/text dict key exists, but text/html does not
488
+ # doc1 = TextLoader(file, encoding="utf8").load()
489
+ docs1 = UnstructuredEmailLoader(file, content_source="text/plain").load()
490
+ add_meta(docs1, file)
491
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
492
+ else:
493
+ raise
494
+ # elif file.endswith('.gcsdir'):
495
+ # doc1 = GCSDirectoryLoader(project_name, bucket, prefix).load()
496
+ # elif file.endswith('.gcsfile'):
497
+ # doc1 = GCSFileLoader(project_name, bucket, blob).load()
498
+ elif file.endswith('.rst'):
499
+ with open(file, "r") as f:
500
+ doc1 = Document(page_content=f.read(), metadata={"source": file})
501
+ add_meta(doc1, file)
502
+ elif file.endswith('.pdf'):
503
+ # Some PDFs return nothing or junk from PDFMinerLoader
504
+ # e.g. Beyond fine-tuning_ Classifying high resolution mammograms using function-preserving transformations _ Elsevier Enhanced Reader.pdf
505
+ doc1 = PyPDFLoader(file).load_and_split()
506
+ add_meta(doc1, file)
507
+ elif file.endswith('.csv'):
508
+ doc1 = CSVLoader(file).load()
509
+ add_meta(doc1, file)
510
+ elif file.endswith('.py'):
511
+ doc1 = PythonLoader(file).load()
512
+ add_meta(doc1, file)
513
+ elif file.endswith('.toml'):
514
+ doc1 = TomlLoader(file).load()
515
+ add_meta(doc1, file)
516
+ elif file.endswith('.urls'):
517
+ with open(file, "r") as f:
518
+ docs1 = UnstructuredURLLoader(urls=f.readlines()).load()
519
+ add_meta(docs1, file)
520
+ doc1 = chunk_sources(docs1, chunk_size=chunk_size)
521
+ elif file.endswith('.zip'):
522
+ with zipfile.ZipFile(file, 'r') as zip_ref:
523
+ # don't put into temporary path, since want to keep references to docs inside zip
524
+ # so just extract into base_path (a directory alongside the zip)
525
+ zip_ref.extractall(base_path)
526
+ # recurse
527
+ doc1 = path_to_docs(base_path, verbose=verbose, fail_any_exception=fail_any_exception)
528
+ else:
529
+ raise RuntimeError("No file handler for %s" % os.path.basename(file))
530
+
531
+ # allow doc1 to be list or not. If not list, did not chunk yet, so chunk now
532
+ if not isinstance(doc1, list):
533
+ if chunk:
534
+ docs = chunk_sources([doc1], chunk_size=chunk_size)
535
+ else:
536
+ docs = [doc1]
537
+ else:
538
+ docs = doc1
539
+
540
+ assert isinstance(docs, list)
541
+ return docs
542
+
543
+
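As a quick illustration of the dispatcher above, a minimal usage sketch (not part of the commit; "paper.pdf" is a placeholder, and it assumes this module is importable as gpt_langchain with the loader dependency for the given file type, here pypdf, installed):

    from gpt_langchain import file_to_doc

    # parse a single local PDF into a list of Documents with source/date metadata attached
    docs = file_to_doc("paper.pdf", chunk=True, chunk_size=512)
    print(len(docs), docs[0].metadata.get("source"))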
544
+ def path_to_doc1(file, verbose=False, fail_any_exception=False, return_file=True, chunk=True, chunk_size=512,
545
+ is_url=False, is_txt=False,
546
+ enable_captions=True,
547
+ captions_model=None,
548
+ enable_ocr=False, caption_loader=None):
549
+ if verbose:
550
+ if is_url:
551
+ print("Ingesting URL: %s" % file, flush=True)
552
+ elif is_txt:
553
+ print("Ingesting Text: %s" % file, flush=True)
554
+ else:
555
+ print("Ingesting file: %s" % file, flush=True)
556
+ res = None
557
+ try:
558
+ # don't pass base_path=path, would infinitely recurse
559
+ res = file_to_doc(file, base_path=None, verbose=verbose, fail_any_exception=fail_any_exception,
560
+ chunk=chunk, chunk_size=chunk_size,
561
+ is_url=is_url, is_txt=is_txt,
562
+ enable_captions=enable_captions,
563
+ captions_model=captions_model,
564
+ enable_ocr=enable_ocr,
565
+ caption_loader=caption_loader)
566
+ except BaseException as e:
567
+ print("Failed to ingest %s due to %s" % (file, traceback.format_exc()))
568
+ if fail_any_exception:
569
+ raise
570
+ else:
571
+ exception_doc = Document(
572
+ page_content='',
573
+ metadata={"source": file, "exception": str(e), "traceback": traceback.format_exc()})
574
+ res = [exception_doc]
575
+ if return_file:
576
+ base_tmp = "temp_path_to_doc1"
577
+ if not os.path.isdir(base_tmp):
578
+ os.makedirs(base_tmp, exist_ok=True)
579
+ filename = os.path.join(base_tmp, str(uuid.uuid4()) + ".tmp.pickle")
580
+ with open(filename, 'wb') as f:
581
+ pickle.dump(res, f)
582
+ return filename
583
+ return res
584
+
585
+
586
+ def path_to_docs(path_or_paths, verbose=False, fail_any_exception=False, n_jobs=-1,
587
+ chunk=True, chunk_size=512,
588
+ url=None, text=None,
589
+ enable_captions=True,
590
+ captions_model=None,
591
+ caption_loader=None,
592
+ enable_ocr=False,
593
+ ):
594
+ globs_image_types = []
595
+ globs_non_image_types = []
596
+ if path_or_paths is None:
597
+ return []
598
+ elif url:
599
+ globs_non_image_types = [url]
600
+ elif text:
601
+ globs_non_image_types = [text]
602
+ elif isinstance(path_or_paths, str):
603
+ # single path, only consume allowed files
604
+ path = path_or_paths
605
+ # Below globs should match patterns in file_to_doc()
606
+ [globs_image_types.extend(glob.glob(os.path.join(path, "./**/*.%s" % ftype), recursive=True))
607
+ for ftype in image_types]
608
+ [globs_non_image_types.extend(glob.glob(os.path.join(path, "./**/*.%s" % ftype), recursive=True))
609
+ for ftype in non_image_types]
610
+ else:
611
+ # list/tuple of files (consume what can, and exception those that selected but cannot consume so user knows)
612
+ assert isinstance(path_or_paths, (list, tuple)), "Wrong type for path_or_paths: %s" % type(path_or_paths)
613
+ # reform out of allowed types
614
+ globs_image_types.extend(flatten_list([[x for x in path_or_paths if x.endswith(y)] for y in image_types]))
615
+ # could do below:
616
+ # globs_non_image_types = flatten_list([[x for x in path_or_paths if x.endswith(y)] for y in non_image_types])
617
+ # But instead, allow fail so can collect unsupported too
618
+ set_globs_image_types = set(globs_image_types)
619
+ globs_non_image_types.extend([x for x in path_or_paths if x not in set_globs_image_types])
620
+ # could use generator, but messes up metadata handling in recursive case
621
+ if (caption_loader and not isinstance(caption_loader, (bool, str)) and
622
+ caption_loader.device != 'cpu') or \
623
+ get_device() == 'cuda':
624
+ # to avoid deadlocks, presume was preloaded and so can't fork due to cuda context
625
+ n_jobs_image = 1
626
+ else:
627
+ n_jobs_image = n_jobs
628
+
629
+ return_file = True # local choice
630
+ is_url = url is not None
631
+ is_txt = text is not None
632
+ kwargs = dict(verbose=verbose, fail_any_exception=fail_any_exception,
633
+ return_file=return_file,
634
+ chunk=chunk, chunk_size=chunk_size,
635
+ is_url=is_url,
636
+ is_txt=is_txt,
637
+ enable_captions=enable_captions,
638
+ captions_model=captions_model,
639
+ caption_loader=caption_loader,
640
+ enable_ocr=enable_ocr,
641
+ )
642
+
643
+ if n_jobs != 1 and len(globs_non_image_types) > 1:
644
+ # avoid nesting, e.g. upload 1 zip and then inside many files
645
+ # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
646
+ documents = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
647
+ delayed(path_to_doc1)(file, **kwargs) for file in globs_non_image_types
648
+ )
649
+ else:
650
+ documents = [path_to_doc1(file, **kwargs) for file in globs_non_image_types]
651
+
652
+ # do images separately since can't fork after cuda in parent, so can't be parallel
653
+ if n_jobs_image != 1 and len(globs_image_types) > 1:
654
+ # avoid nesting, e.g. upload 1 zip and then inside many files
655
+ # harder to handle if upload many zips with many files, inner parallel one will be disabled by joblib
656
+ image_documents = Parallel(n_jobs=n_jobs, verbose=10 if verbose else 0, backend='multiprocessing')(
657
+ delayed(path_to_doc1)(file, **kwargs) for file in globs_image_types
658
+ )
659
+ else:
660
+ image_documents = [path_to_doc1(file, **kwargs) for file in globs_image_types]
661
+
662
+ # add image docs in
663
+ documents += image_documents
664
+
665
+ if return_file:
666
+ # then documents really are files
667
+ files = documents.copy()
668
+ documents = []
669
+ for fil in files:
670
+ with open(fil, 'rb') as f:
671
+ documents.extend(pickle.load(f))
672
+ # remove temp pickle
673
+ os.remove(fil)
674
+ else:
675
+ documents = reduce(concat, documents)
676
+ return documents
677
+
678
+
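A hedged sketch of driving the directory walker above (directory and URL are placeholders; URL ingestion additionally needs the unstructured package):

    # ingest every supported file under ./user_path in parallel, chunked for embedding
    docs = path_to_docs("user_path", n_jobs=4, chunk=True, chunk_size=512)

    # single-URL ingestion goes through the same path (note path_or_paths must not be None)
    url_docs = path_to_docs('', url="https://www.example.com")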
679
+ def prep_langchain(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode, user_path,
680
+ hf_embedding_model, n_jobs=-1, kwargs_make_db={}):
681
+ """
682
+ do prep first time, involving downloads
683
+ # FIXME: Add github caching then add here
684
+ :return:
685
+ """
686
+ assert langchain_mode not in ['MyData'], "Should not prep scratch data"
687
+
688
+ if os.path.isdir(persist_directory):
689
+ print("Prep: persist_directory=%s exists, using" % persist_directory, flush=True)
690
+ db = get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
691
+ hf_embedding_model)
692
+ else:
693
+ print("Prep: persist_directory=%s does not exist, regenerating" % persist_directory, flush=True)
694
+ db = None
695
+ if langchain_mode in ['All', 'DriverlessAI docs']:
696
+ # FIXME: Could also just use dai_docs.pickle directly and upload that
697
+ get_dai_docs(from_hf=True)
698
+
699
+ if langchain_mode in ['All', 'wiki']:
700
+ get_wiki_sources(first_para=kwargs_make_db['first_para'], text_limit=kwargs_make_db['text_limit'])
701
+
702
+ langchain_kwargs = kwargs_make_db.copy()
703
+ langchain_kwargs.update(locals())
704
+ db = make_db(**langchain_kwargs)
705
+
706
+ return db
707
+
708
+
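A sketch of the one-time prep call above, with placeholder arguments (not part of the commit); kwargs_make_db has to supply whatever _make_db needs that is not already a prep_langchain argument:

    db = prep_langchain(persist_directory='db_dir_UserData',
                        load_db_if_exists=True, db_type='chroma',
                        use_openai_embedding=False, langchain_mode='UserData',
                        user_path='user_path',
                        hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                        kwargs_make_db=dict(first_para=False, text_limit=None,
                                            chunk=True, chunk_size=512))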
709
+ def get_existing_db(persist_directory, load_db_if_exists, db_type, use_openai_embedding, langchain_mode,
710
+ hf_embedding_model):
711
+ if load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
712
+ os.path.join(persist_directory, 'index')):
713
+ print("DO Loading db: %s" % langchain_mode, flush=True)
714
+ embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
715
+ db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
716
+ collection_name=langchain_mode.replace(' ', '_'))
717
+ print("DONE Loading db: %s" % langchain_mode, flush=True)
718
+ return db
719
+ return None
720
+
721
+
722
+ def make_db(**langchain_kwargs):
723
+ func_names = list(inspect.signature(_make_db).parameters)
724
+ missing_kwargs = [x for x in func_names if x not in langchain_kwargs]
725
+ defaults_db = {k: v.default for k, v in dict(inspect.signature(run_qa_db).parameters).items()}
726
+ for k in missing_kwargs:
727
+ if k in defaults_db:
728
+ langchain_kwargs[k] = defaults_db[k]
729
+ # final check for missing
730
+ missing_kwargs = [x for x in func_names if x not in langchain_kwargs]
731
+ assert not missing_kwargs, "Missing kwargs: %s" % missing_kwargs
732
+ # only keep actual used
733
+ langchain_kwargs = {k: v for k, v in langchain_kwargs.items() if k in func_names}
734
+ return _make_db(**langchain_kwargs)
735
+
736
+
737
+ def _make_db(use_openai_embedding=False,
738
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
739
+ first_para=False, text_limit=None, chunk=False, chunk_size=1024,
740
+ langchain_mode=None,
741
+ user_path=None,
742
+ db_type='faiss',
743
+ load_db_if_exists=False,
744
+ db=None,
745
+ n_jobs=-1):
746
+ persist_directory = 'db_dir_%s' % langchain_mode # single place, no special names for each case
747
+ if not db and load_db_if_exists and db_type == 'chroma' and os.path.isdir(persist_directory) and os.path.isdir(
748
+ os.path.join(persist_directory, 'index')):
749
+ assert langchain_mode not in ['MyData'], "Should not load MyData db this way"
750
+ print("Loading db", flush=True)
751
+ embedding = get_embedding(use_openai_embedding, hf_embedding_model=hf_embedding_model)
752
+ db = Chroma(persist_directory=persist_directory, embedding_function=embedding,
753
+ collection_name=langchain_mode.replace(' ', '_'))
754
+ elif not db:
755
+ assert langchain_mode not in ['MyData'], "Should not make MyData db this way"
756
+ sources = []
757
+ print("Generating sources", flush=True)
758
+ if langchain_mode in ['wiki_full', 'All', "'All'"]:
759
+ from read_wiki_full import get_all_documents
760
+ small_test = None
761
+ print("Generating new wiki", flush=True)
762
+ sources1 = get_all_documents(small_test=small_test, n_jobs=os.cpu_count() // 2)
763
+ print("Got new wiki", flush=True)
764
+ if chunk:
765
+ sources1 = chunk_sources(sources1, chunk_size=chunk_size)
766
+ print("Chunked new wiki", flush=True)
767
+ sources.extend(sources1)
768
+ if langchain_mode in ['wiki', 'All', "'All'"]:
769
+ sources1 = get_wiki_sources(first_para=first_para, text_limit=text_limit)
770
+ if chunk:
771
+ sources1 = chunk_sources(sources1, chunk_size=chunk_size)
772
+ sources.extend(sources1)
773
+ if langchain_mode in ['github h2oGPT', 'All', "'All'"]:
774
+ # sources = get_github_docs("dagster-io", "dagster")
775
+ sources1 = get_github_docs("h2oai", "h2ogpt")
776
+ # FIXME: always chunk for now
777
+ sources1 = chunk_sources(sources1, chunk_size=chunk_size)
778
+ sources.extend(sources1)
779
+ if langchain_mode in ['DriverlessAI docs', 'All', "'All'"]:
780
+ sources1 = get_dai_docs(from_hf=True)
781
+ if chunk and False: # FIXME: DAI docs are already chunked well, should only chunk more if over limit
782
+ sources1 = chunk_sources(sources1, chunk_size=chunk_size)
783
+ sources.extend(sources1)
784
+ if langchain_mode in ['All', 'UserData']:
785
+ if user_path:
786
+ # chunk internally for speed over multiple docs
787
+ sources1 = path_to_docs(user_path, n_jobs=n_jobs, chunk=chunk, chunk_size=chunk_size)
788
+ sources.extend(sources1)
789
+ else:
790
+ print("Chose UserData but user_path is empty/None", flush=True)
791
+ if False and langchain_mode in ['urls', 'All', "'All'"]:
792
+ # from langchain.document_loaders import UnstructuredURLLoader
793
+ # loader = UnstructuredURLLoader(urls=urls)
794
+ urls = ["https://www.birdsongsf.com/who-we-are/"]
795
+ from langchain.document_loaders import PlaywrightURLLoader
796
+ loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"])
797
+ sources1 = loader.load()
798
+ sources.extend(sources1)
799
+ if not sources:
800
+ print("langchain_mode %s has no sources, not making db" % langchain_mode, flush=True)
801
+ return None
802
+ print("Generating db", flush=True)
803
+ db = get_db(sources, use_openai_embedding=use_openai_embedding, db_type=db_type,
804
+ persist_directory=persist_directory, langchain_mode=langchain_mode,
805
+ hf_embedding_model=hf_embedding_model)
806
+ print("Generated db", flush=True)
807
+ return db
808
+
809
+
810
+ source_prefix = "Sources [Score | Link]:"
811
+ source_postfix = "End Sources<p>"
812
+
813
+
814
+ def run_qa_db(**kwargs):
815
+ func_names = list(inspect.signature(_run_qa_db).parameters)
816
+ # hard-coded defaults
817
+ kwargs['answer_with_sources'] = True
818
+ kwargs['sanitize_bot_response'] = True
819
+ kwargs['show_rank'] = False
820
+ missing_kwargs = [x for x in func_names if x not in kwargs]
821
+ assert not missing_kwargs, "Missing kwargs: %s" % missing_kwargs
822
+ # only keep actual used
823
+ kwargs = {k: v for k, v in kwargs.items() if k in func_names}
824
+ return _run_qa_db(**kwargs)
825
+
826
+
827
+ def _run_qa_db(query=None,
828
+ use_openai_model=False, use_openai_embedding=False,
829
+ first_para=False, text_limit=None, k=4, chunk=False, chunk_size=1024,
830
+ user_path=None,
831
+ db_type='faiss',
832
+ model_name=None, model=None, tokenizer=None,
833
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
834
+ stream_output=False,
835
+ prompter=None,
836
+ prompt_type=None,
837
+ answer_with_sources=True,
838
+ cut_distanct=1.1,
839
+ sanitize_bot_response=True,
840
+ show_rank=False,
841
+ load_db_if_exists=False,
842
+ db=None,
843
+ max_new_tokens=256,
844
+ temperature=0.1,
845
+ repetition_penalty=1.0,
846
+ top_k=40,
847
+ top_p=0.7,
848
+ langchain_mode=None,
849
+ n_jobs=-1):
850
+ """
851
+
852
+ :param query:
853
+ :param use_openai_model:
854
+ :param use_openai_embedding:
855
+ :param first_para:
856
+ :param text_limit:
857
+ :param k:
858
+ :param chunk:
859
+ :param chunk_size:
860
+ :param user_path: user path to glob recursively from
861
+ :param db_type: 'faiss' for in-memory db or 'chroma' for persistent db
862
+ :param model_name: model name, used to switch behaviors
863
+ :param model: pre-initialized model, else will make new one
864
+ :param tokenizer: pre-initialized tokenizer, else will make new one. Required not None if model is not None
865
+ :param answer_with_sources: whether to append the sources list to the answer
866
+ :return:
867
+ """
868
+
869
+ # FIXME: For All just go over all dbs instead of a separate db for All
870
+ db = make_db(**locals())
871
+ prompt_type = prompter.prompt_type if prompter is not None else prompt_type
872
+ llm, model_name, streamer, prompt_type_out = get_llm(use_openai_model=use_openai_model, model_name=model_name,
873
+ model=model, tokenizer=tokenizer,
874
+ stream_output=stream_output,
875
+ max_new_tokens=max_new_tokens,
876
+ temperature=temperature,
877
+ repetition_penalty=repetition_penalty,
878
+ top_k=top_k,
879
+ top_p=top_p,
880
+ prompt_type=prompt_type,
881
+ )
882
+
883
+ if model_name in ['llama', 'gptj']:
884
+ # FIXME: for now, streams to stdout/stderr
885
+ stream_output = False
886
+
887
+ if not use_openai_model and prompt_type not in ['plain'] or model_name in ['llama', 'gptj']:
888
+ # instruct-like, rather than few-shot prompt_type='plain' as default
889
+ # but then sources can confuse the model depending on how they are inserted among the rest of the text, so avoid that
890
+ prefix = ""
891
+ if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
892
+ use_context = False
893
+ template = """%s{context}{question}""" % prefix
894
+ else:
895
+ use_context = True
896
+ template = """%s
897
+ ==
898
+ {context}
899
+ ==
900
+ {question}""" % prefix
901
+ prompt = PromptTemplate(
902
+ # input_variables=["summaries", "question"],
903
+ input_variables=["context", "question"],
904
+ template=template,
905
+ )
906
+ chain = load_qa_chain(llm, prompt=prompt)
907
+ else:
908
+ chain = load_qa_with_sources_chain(llm)
909
+ use_context = True
910
+
911
+ if query is None:
912
+ query = "What are the main differences between Linux and Windows?"
913
+ # https://github.com/hwchase17/langchain/issues/1946
914
+ # FIXME: Seems no way to get size of chroma db to limit k, to avoid
915
+ # Chroma collection MyData contains fewer than 4 elements.
916
+ # type logger error
917
+ k_db = 1000 if db_type == 'chroma' else k # k=100 works ok too
918
+
919
+ if db and use_context:
920
+ docs_with_score = db.similarity_search_with_score(query, k=k_db)[:k]
921
+ # cut off so no high distance docs/sources considered
922
+ docs = [x[0] for x in docs_with_score if x[1] < cut_distanct]
923
+ scores = [x[1] for x in docs_with_score if x[1] < cut_distanct]
924
+ if len(scores) > 0:
925
+ print("Distance: min: %s max: %s mean: %s median: %s" %
926
+ (scores[0], scores[-1], np.mean(scores), np.median(scores)), flush=True)
927
+ else:
928
+ docs = []
929
+ scores = []
930
+
931
+ if not docs and use_context:
932
+ return None
933
+
934
+ common_words_file = "data/NGSL_1.2_stats.csv.zip"
935
+ if os.path.isfile(common_words_file):
936
+ df = pd.read_csv("data/NGSL_1.2_stats.csv.zip")
937
+ import string
938
+ reduced_query = query.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))).strip()
939
+ reduced_query_words = reduced_query.split(' ')
940
+ set_common = set(df['Lemma'].values.tolist())
941
+ num_common = len([x for x in reduced_query_words if x.lower() in set_common])
942
+ frac_common = num_common / len(reduced_query_words)
943
+ # FIXME: report to user bad query that uses too many common words
944
+ print("frac_common: %s" % frac_common, flush=True)
945
+
946
+ if langchain_mode in ['Disabled', 'ChatLLM', 'LLM']:
947
+ chain_kwargs = dict(input_documents=[], question=query)
948
+ else:
949
+ chain_kwargs = dict(input_documents=docs, question=query)
950
+
951
+ if stream_output:
952
+ answer = None
953
+ assert streamer is not None
954
+ target = wrapped_partial(chain, chain_kwargs)
955
+ import queue
956
+ bucket = queue.Queue()
957
+ thread = EThread(target=target, streamer=streamer, bucket=bucket)
958
+ thread.start()
959
+ outputs = ""
960
+ prompt = None # FIXME
961
+ try:
962
+ for new_text in streamer:
963
+ # print("new_text: %s" % new_text, flush=True)
964
+ if bucket.qsize() > 0 or thread.exc:
965
+ thread.join()
966
+ outputs += new_text
967
+ if prompter: # and False: # FIXME: pipeline can already use prompter
968
+ output1 = prompter.get_response(outputs, prompt=prompt,
969
+ sanitize_bot_response=sanitize_bot_response)
970
+ yield output1
971
+ else:
972
+ yield outputs
973
+ except BaseException:
974
+ # if any exception, raise that exception if was from thread, first
975
+ if thread.exc:
976
+ raise thread.exc
977
+ raise
978
+ finally:
979
+ # in case no exception and didn't join with thread yet, then join
980
+ if not thread.exc:
981
+ answer = thread.join()
982
+ # in case raise StopIteration or broke queue loop in streamer, but still have exception
983
+ if thread.exc:
984
+ raise thread.exc
985
+ # FIXME: answer is not string outputs from streamer. How to get actual final output?
986
+ # answer = outputs
987
+ else:
988
+ answer = chain(chain_kwargs)
989
+
990
+ if not use_context:
991
+ ret = answer['output_text']
992
+ yield ret
993
+ elif answer is not None:
994
+ print("query: %s" % query, flush=True)
995
+ print("answer: %s" % answer['output_text'], flush=True)
996
+ # link
997
+ answer_sources = [(max(0.0, 1.5 - score) / 1.5, get_url(doc)) for score, doc in
998
+ zip(scores, answer['input_documents'])]
999
+ answer_sources_dict = defaultdict(list)
1000
+ [answer_sources_dict[url].append(score) for score, url in answer_sources]
1001
+ answers_dict = {}
1002
+ for url, scores_url in answer_sources_dict.items():
1003
+ answers_dict[url] = np.max(scores_url)
1004
+ answer_sources = [(score, url) for url, score in answers_dict.items()]
1005
+ answer_sources.sort(key=lambda x: x[0], reverse=True)
1006
+ if show_rank:
1007
+ # answer_sources = ['%d | %s' % (1 + rank, url) for rank, (score, url) in enumerate(answer_sources)]
1008
+ # sorted_sources_urls = "Sources [Rank | Link]:<br>" + "<br>".join(answer_sources)
1009
+ answer_sources = ['%s' % url for rank, (score, url) in enumerate(answer_sources)]
1010
+ sorted_sources_urls = "Ranked Sources:<br>" + "<br>".join(answer_sources)
1011
+ else:
1012
+ answer_sources = ['<li>%.2g | %s</li>' % (score, url) for score, url in answer_sources]
1013
+ sorted_sources_urls = f"{source_prefix}<p><ul>" + "<p>".join(answer_sources)
1014
+ sorted_sources_urls += f"</ul></p>{source_postfix}"
1015
+
1016
+ if not answer['output_text'].endswith('\n'):
1017
+ answer['output_text'] += '\n'
1018
+
1019
+ if answer_with_sources:
1020
+ ret = answer['output_text'] + '\n' + sorted_sources_urls
1021
+ else:
1022
+ ret = answer['output_text']
1023
+
1024
+ yield ret
1025
+ return
1026
+
1027
+
1028
+ def chunk_sources(sources, chunk_size=1024):
1029
+ source_chunks = []
1030
+ # Below for known separator
1031
+ # splitter = CharacterTextSplitter(separator=" ", chunk_size=chunk_size, chunk_overlap=0)
1032
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
1033
+ for source in sources:
1034
+ # print(source.metadata['source'], flush=True)
1035
+ for chunky in splitter.split_text(source.page_content):
1036
+ source_chunks.append(Document(page_content=chunky, metadata=source.metadata))
1037
+ return source_chunks
1038
+
1039
+
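A small illustration of the chunker above (hypothetical document; RecursiveCharacterTextSplitter merges whitespace-delimited splits up to chunk_size characters):

    doc = Document(page_content="lorem ipsum " * 500, metadata={"source": "demo.txt"})
    chunks = chunk_sources([doc], chunk_size=256)
    print(len(chunks), max(len(c.page_content) for c in chunks))  # many chunks, none longer than 256 chars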
1040
+ def get_db_from_hf(dest=".", db_dir='db_dir_DriverlessAI_docs.zip'):
1041
+ from huggingface_hub import hf_hub_download
1042
+ # True for case when locally already logged in with correct token, so don't have to set key
1043
+ token = os.getenv('HUGGINGFACE_API_TOKEN', True)
1044
+ path_to_zip_file = hf_hub_download('h2oai/db_dirs', db_dir, token=token, repo_type='dataset')
1045
+ import zipfile
1046
+ with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
1047
+ zip_ref.extractall(dest)
1048
+ return path_to_zip_file
1049
+
1050
+
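A hedged example of fetching one of the prebuilt databases listed just below (requires huggingface_hub; HUGGINGFACE_API_TOKEN only if the dataset needs authentication):

    path_to_zip = get_db_from_hf(dest=".", db_dir="db_dir_wiki.zip")
    # the zip is downloaded and extracted, so ./db_dir_wiki/index should now exist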
1051
+ # Note dir has space in some cases, while zip does not
1052
+ some_db_zips = [['db_dir_DriverlessAI_docs.zip', 'db_dir_DriverlessAI docs', 'CC-BY-NC license'],
1053
+ ['db_dir_UserData.zip', 'db_dir_UserData', 'CC-BY license for ArXiv'],
1054
+ ['db_dir_github_h2oGPT.zip', 'db_dir_github h2oGPT', 'ApacheV2 license'],
1055
+ ['db_dir_wiki.zip', 'db_dir_wiki', 'CC-BY-SA Wikipedia license'],
1056
+ # ['db_dir_wiki_full.zip', 'db_dir_wiki_full.zip', '23GB, 05/04/2023 CC-BY-SA Wiki license'],
1057
+ ]
1058
+
1059
+ all_db_zips = some_db_zips + \
1060
+ [['db_dir_wiki_full.zip', 'db_dir_wiki_full.zip', '23GB, 05/04/2023 CC-BY-SA Wiki license'],
1061
+ ]
1062
+
1063
+
1064
+ def get_some_dbs_from_hf(dest='.', db_zips=None):
1065
+ if db_zips is None:
1066
+ db_zips = some_db_zips
1067
+ for db_dir, dir_expected, license1 in db_zips:
1068
+ path_to_zip_file = get_db_from_hf(dest=dest, db_dir=db_dir)
1069
+ assert os.path.isfile(path_to_zip_file), "Missing zip in %s" % path_to_zip_file
1070
+ if dir_expected:
1071
+ assert os.path.isdir(os.path.join(dest, dir_expected)), "Missing path for %s" % dir_expected
1072
+ assert os.path.isdir(os.path.join(dest, dir_expected, 'index')), "Missing index in %s" % dir_expected
1073
+
1074
+
1075
+ if __name__ == '__main__':
1076
+ pass
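Taken together, a minimal end-to-end sketch of this module (not part of the commit; user_path and the question are placeholders, and make_db returns None when no sources are found):

    # build (or load) a persistent Chroma db over local files, then retrieve the closest chunks
    db = make_db(use_openai_embedding=False,
                 hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                 first_para=False, text_limit=None, chunk=True, chunk_size=512,
                 langchain_mode='UserData', user_path='user_path',
                 db_type='chroma', load_db_if_exists=True, db=None, n_jobs=-1)
    docs_with_score = db.similarity_search_with_score("What do my documents cover?", k=4)
    for doc, score in docs_with_score:
        print("%.3f %s" % (score, doc.metadata.get('source')))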
gradio_runner.py CHANGED
@@ -1,15 +1,23 @@
1
  import copy
2
  import functools
3
  import inspect
 
4
  import os
 
5
  import sys
6
 
7
  from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
8
- from prompter import Prompter
 
9
  from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
10
- ping
11
- from finetune import prompt_type_to_model_name, prompt_types_strings, generate_prompt, inv_prompt_type_to_model_lower
12
- from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa
13
 
14
  import gradio as gr
15
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -25,6 +33,21 @@ def go_gradio(**kwargs):
25
  model_state0 = kwargs['model_state0']
26
  score_model_state0 = kwargs['score_model_state0']
27
  queue = True
28
 
29
  # easy update of kwargs needed for evaluate() etc.
30
  kwargs.update(locals())
@@ -42,6 +65,9 @@ def go_gradio(**kwargs):
42
  title = 'h2oGPT'
43
  if 'h2ogpt-research' in kwargs['base_model']:
44
  title += " [Research demonstration]"
45
  if kwargs['verbose']:
46
  description = f"""Model {kwargs['base_model']} Instruct dataset.
47
  For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio).
@@ -49,9 +75,11 @@ def go_gradio(**kwargs):
49
  Hash: {get_githash()}
50
  """
51
  else:
52
- description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
53
  description += "If this host is busy, try [12B](https://gpt.h2o.ai), [30B](http://gpt2.h2o.ai), [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) or [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
54
  description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
55
 
56
  if kwargs['verbose']:
57
  task_info_md = f"""
@@ -66,6 +94,9 @@ def go_gradio(**kwargs):
66
  """
67
  else:
68
  css_code = """footer {visibility: hidden}"""
69
 
70
  if kwargs['gradio_avoid_processing_markdown']:
71
  from gradio_client import utils as client_utils
@@ -134,6 +165,8 @@ def go_gradio(**kwargs):
134
  model_state2 = gr.State([None, None, None, None])
135
  model_options_state = gr.State([model_options])
136
  lora_options_state = gr.State([lora_options])
137
  gr.Markdown(f"""
138
  {get_h2o_title(title) if kwargs['h2ocolors'] else get_simple_title(title)}
139
 
@@ -142,7 +175,7 @@ def go_gradio(**kwargs):
142
  """)
143
  if is_hf:
144
  gr.HTML(
145
- '''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
146
 
147
  # go button visible if
148
  base_wanted = kwargs['base_model'] != no_model_str and kwargs['login_mode_if_model0']
@@ -153,7 +186,7 @@ def go_gradio(**kwargs):
153
  with gr.Row():
154
  col_nochat = gr.Column(visible=not kwargs['chat'])
155
  with col_nochat: # FIXME: for model comparison, and check rest
156
- text_output_nochat = gr.Textbox(lines=5, label=output_label0)
157
  instruction_nochat = gr.Textbox(
158
  lines=kwargs['input_lines'],
159
  label=instruction_label_nochat,
@@ -187,7 +220,7 @@ def go_gradio(**kwargs):
187
  submit = gr.Button(value='Submit').style(full_width=False, size='sm')
188
  stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
189
  with gr.Row():
190
- clear = gr.Button("New Conversation")
191
  flag_btn = gr.Button("Flag")
192
  if not kwargs['auto_score']: # FIXME: For checkbox model2
193
  with gr.Column(visible=kwargs['score_model']):
@@ -206,7 +239,7 @@ def go_gradio(**kwargs):
206
  score_text2 = gr.Textbox("Response Score2: NA", show_label=False, visible=False)
207
  retry = gr.Button("Regenerate")
208
  undo = gr.Button("Undo")
209
- with gr.TabItem("Input/Output"):
210
  with gr.Row():
211
  if 'mbart-' in kwargs['model_lower']:
212
  src_lang = gr.Dropdown(list(languages_covered().keys()),
@@ -215,6 +248,122 @@ def go_gradio(**kwargs):
215
  tgt_lang = gr.Dropdown(list(languages_covered().keys()),
216
  value=kwargs['tgt_lang'],
217
  label="Output Language")
218
  with gr.TabItem("Expert"):
219
  with gr.Row():
220
  with gr.Column():
@@ -243,7 +392,7 @@ def go_gradio(**kwargs):
243
  )
244
  # FIXME: https://github.com/h2oai/h2ogpt/issues/106
245
  if os.getenv('TESTINGFAIL'):
246
- max_beams = 8 if not (is_low_mem or is_public) else 1
247
  else:
248
  max_beams = 1
249
  num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
@@ -356,12 +505,13 @@ def go_gradio(**kwargs):
356
  with gr.Column():
357
  with gr.Row():
358
  system_btn = gr.Button(value='Get System Info')
359
- system_text = gr.Textbox(label='System Info', interactive=False)
 
360
 
361
  with gr.Row():
362
  zip_btn = gr.Button("Zip")
363
  zip_text = gr.Textbox(label="Zip file name", interactive=False)
364
- file_output = gr.File()
365
  with gr.Row():
366
  s3up_btn = gr.Button("S3UP")
367
  s3up_text = gr.Textbox(label='S3UP result', interactive=False)
@@ -378,8 +528,103 @@ def go_gradio(**kwargs):
378
 
379
  # Get flagged data
380
  zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
381
- zip_btn.click(zip_data1, inputs=None, outputs=[file_output, zip_text], queue=False)
382
- s3up_btn.click(s3up, inputs=zip_text, outputs=s3up_text, queue=False)
383
 
384
  def check_admin_pass(x):
385
  return gr.update(visible=x == admin_pass)
@@ -569,49 +814,66 @@ def go_gradio(**kwargs):
569
  """
570
  # don't deepcopy, can contain model itself
571
  args_list = list(args).copy()
572
- history = args_list[-1] # model_state is -2
573
  if retry and history:
574
  history.pop()
575
  if not history:
576
  print("No history", flush=True)
577
  history = [['', None]]
578
  yield history, ''
579
  return
580
  # ensure output will be unique to models
 
581
  history = copy.deepcopy(history)
582
  instruction1 = history[-1][0]
583
  context1 = ''
584
- if kwargs['chat_history'] > 0:
585
- prompt_type_arg_id = eval_func_param_names.index('prompt_type')
586
- prompt_type1 = args_list[prompt_type_arg_id]
587
- chat_arg_id = eval_func_param_names.index('chat')
588
- chat1 = args_list[chat_arg_id]
589
  context1 = ''
590
- for histi in range(len(history) - 1):
 
591
  data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
592
  prompt, pre_response, terminate_response, chat_sep = generate_prompt(data_point, prompt_type1,
593
  chat1, reduced=True)
594
- # md -> back to text, maybe not super improtant if model trained enough
595
  prompt = prompt.replace('<br>', chat_sep)
596
- context1 += prompt
597
- if not context1.endswith(chat_sep):
598
- context1 += chat_sep
599
 
600
  _, pre_response, terminate_response, chat_sep = generate_prompt({}, prompt_type1, chat1,
601
  reduced=True)
602
  if context1 and not context1.endswith(chat_sep):
603
  context1 += chat_sep # ensure if terminates abruptly, then human continues on next line
604
  args_list[0] = instruction1 # override original instruction with history from user
605
- # only include desired chat history
606
- args_list[2] = context1[-kwargs['chat_history']:]
607
- model_state1 = args_list[-2]
608
  if model_state1[0] is None or model_state1[0] == no_model_str:
609
  history = [['', None]]
610
  yield history, ''
611
  return
612
- args_list = args_list[:-2]
613
  fun1 = partial(evaluate,
614
  model_state1,
 
615
  **kwargs_evaluate)
616
  try:
617
  for output in fun1(*tuple(args_list)):
@@ -645,11 +907,11 @@ def go_gradio(**kwargs):
645
  outputs=text_output,
646
  )
647
  bot_args = dict(fn=bot,
648
- inputs=inputs_list + [model_state] + [text_output],
649
  outputs=[text_output, exception_text],
650
  )
651
  retry_bot_args = dict(fn=functools.partial(bot, retry=True),
652
- inputs=inputs_list + [model_state] + [text_output],
653
  outputs=[text_output, exception_text],
654
  )
655
  undo_user_args = dict(fn=functools.partial(user, undo=True),
@@ -663,11 +925,11 @@ def go_gradio(**kwargs):
663
  outputs=text_output2,
664
  )
665
  bot_args2 = dict(fn=bot,
666
- inputs=inputs_list + [model_state2] + [text_output2],
667
  outputs=[text_output2, exception_text],
668
  )
669
  retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
670
- inputs=inputs_list + [model_state2] + [text_output2],
671
  outputs=[text_output2, exception_text],
672
  )
673
  undo_user_args2 = dict(fn=functools.partial(user, undo=True),
@@ -694,7 +956,8 @@ def go_gradio(**kwargs):
694
  .then(clear_instruct, None, iinput)
695
  submit_event1d = submit_event1c.then(**bot_args, api_name='instruction_bot' if allow_api else None,
696
  queue=queue)
697
- submit_event1e = submit_event1d.then(**score_args_submit, api_name='instruction_bot_score' if allow_api else None,
 
698
  queue=queue)
699
  submit_event1f = submit_event1e.then(**bot_args2, api_name='instruction_bot2' if allow_api else None,
700
  queue=queue)
@@ -735,12 +998,134 @@ def go_gradio(**kwargs):
735
  .then(**score_args_submit, api_name='undo_score' if allow_api else None) \
736
  .then(**score_args2_submit, api_name='undo_score2' if allow_api else None)
737
 
738
  # does both models
739
- clear.click(lambda: None, None, text_output, queue=False, api_name='clear' if allow_api else None) \
740
- .then(lambda: None, None, text_output2, queue=False, api_name='clear2' if allow_api else None)
741
  # NOTE: clear of instruction/iinput for nochat has to come after score,
742
  # because score for nochat consumes actual textbox, while chat consumes chat history filled by user()
743
- submit_event_nochat = submit_nochat.click(fun, inputs=[model_state] + inputs_list,
 
744
  outputs=text_output_nochat,
745
  queue=queue,
746
  api_name='submit_nochat' if allow_api else None) \
@@ -842,8 +1227,8 @@ def go_gradio(**kwargs):
842
  new_state = [list0[0] + [x]]
843
  new_options = [*new_state[0]]
844
  return gr.Dropdown.update(value=x, choices=new_options), \
845
- gr.Dropdown.update(value=x, choices=new_options), \
846
- '', new_state
847
 
848
  add_model_event = add_model_button.click(fn=dropdown_model_list,
849
  inputs=[model_options_state, new_model],
@@ -857,8 +1242,8 @@ def go_gradio(**kwargs):
857
  x1 = x if model_used1 == no_model_str else lora_used1
858
  x2 = x if model_used2 == no_model_str else lora_used2
859
  return gr.Dropdown.update(value=x1, choices=new_options), \
860
- gr.Dropdown.update(value=x2, choices=new_options), \
861
- '', new_state
862
 
863
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
864
  inputs=[lora_options_state, new_lora, model_used, lora_used, model_used2,
@@ -916,10 +1301,20 @@ def go_gradio(**kwargs):
916
 
917
  scheduler = BackgroundScheduler()
918
  scheduler.add_job(func=clear_torch_cache, trigger="interval", seconds=20)
919
- if is_public:
920
  scheduler.add_job(func=ping, trigger="interval", seconds=60)
921
  scheduler.start()
922
 
  demo.launch(share=kwargs['share'], server_name="0.0.0.0", show_error=True,
924
  favicon_path=favicon_path, prevent_thread_lock=True,
925
  auth=kwargs['auth'])
@@ -928,9 +1323,7 @@ def go_gradio(**kwargs):
928
  demo.block_thread()
929
 
930
 
931
- input_args_list = ['model_state']
932
- inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
933
- 'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
934
 
935
 
936
  def get_inputs_list(inputs_dict, model_lower):
@@ -946,9 +1339,204 @@ def get_inputs_list(inputs_dict, model_lower):
946
  if k == 'kwargs':
947
  continue
948
  if k in input_args_list + inputs_kwargs_list:
949
- # these are added via partial, not taken as input
950
  continue
951
  if 'mbart-' not in model_lower and k in ['src_lang', 'tgt_lang']:
952
  continue
953
  inputs_list.append(inputs_dict[k])
954
  return inputs_list
 
1
  import copy
2
  import functools
3
  import inspect
4
+ import json
5
  import os
6
+ import random
7
  import sys
8
+ import traceback
9
+ import uuid
10
+ import filelock
11
+ import pandas as pd
12
+ import tabulate
13
 
14
  from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
15
+ from prompter import Prompter, \
16
+ prompt_type_to_model_name, prompt_types_strings, inv_prompt_type_to_model_lower, generate_prompt
17
  from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
18
+ ping, get_short_name, get_url, makedirs
19
+ from generate import get_model, languages_covered, evaluate, eval_func_param_names, score_qa, langchain_modes, \
20
+ inputs_kwargs_list, get_cutoffs, scratch_base_dir
21
 
22
  import gradio as gr
23
  from apscheduler.schedulers.background import BackgroundScheduler
 
33
  model_state0 = kwargs['model_state0']
34
  score_model_state0 = kwargs['score_model_state0']
35
  queue = True
36
+ dbs = kwargs['dbs']
37
+ db_type = kwargs['db_type']
38
+ visible_langchain_modes = kwargs['visible_langchain_modes']
39
+ allow_upload_to_user_data = kwargs['allow_upload_to_user_data']
40
+ allow_upload_to_my_data = kwargs['allow_upload_to_my_data']
41
+ enable_sources_list = kwargs['enable_sources_list']
42
+ enable_url_upload = kwargs['enable_url_upload']
43
+ enable_text_upload = kwargs['enable_text_upload']
44
+ allow_upload = allow_upload_to_user_data or allow_upload_to_my_data
45
+ use_openai_embedding = kwargs['use_openai_embedding']
46
+ hf_embedding_model = kwargs['hf_embedding_model']
47
+ enable_captions = kwargs['enable_captions']
48
+ captions_model = kwargs['captions_model']
49
+ enable_ocr = kwargs['enable_ocr']
50
+ caption_loader = kwargs['caption_loader']
51
 
52
  # easy update of kwargs needed for evaluate() etc.
53
  kwargs.update(locals())
 
65
  title = 'h2oGPT'
66
  if 'h2ogpt-research' in kwargs['base_model']:
67
  title += " [Research demonstration]"
68
+ more_info = """For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O-LLMStudio](https://github.com/h2oai/h2o-llmstudio)<br>"""
69
+ if is_public:
70
+ more_info += """<iframe src="https://ghbtns.com/github-btn.html?user=h2oai&repo=h2ogpt&type=star&count=true&size=small" frameborder="0" scrolling="0" width="150" height="20" title="GitHub"></iframe>"""
71
  if kwargs['verbose']:
72
  description = f"""Model {kwargs['base_model']} Instruct dataset.
73
  For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio).
 
75
  Hash: {get_githash()}
76
  """
77
  else:
78
+ description = more_info
79
  description += "If this host is busy, try [12B](https://gpt.h2o.ai), [30B](http://gpt2.h2o.ai), [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) or [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
80
  description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
81
+ if is_hf:
82
+ description += '''<a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" style="white-space: nowrap" alt="Duplicate Space"></a>'''
83
 
84
  if kwargs['verbose']:
85
  task_info_md = f"""
 
94
  """
95
  else:
96
  css_code = """footer {visibility: hidden}"""
97
+ css_code += """
98
+ body.dark{#warning {background-color: #555555};}
99
+ """
100
 
101
  if kwargs['gradio_avoid_processing_markdown']:
102
  from gradio_client import utils as client_utils
 
165
  model_state2 = gr.State([None, None, None, None])
166
  model_options_state = gr.State([model_options])
167
  lora_options_state = gr.State([lora_options])
168
+ my_db_state = gr.State([None, None])
169
+ chat_state = gr.State({})
170
  gr.Markdown(f"""
171
  {get_h2o_title(title) if kwargs['h2ocolors'] else get_simple_title(title)}
172
 
 
175
  """)
176
  if is_hf:
177
  gr.HTML(
178
+ )
179
 
180
  # go button visible if
181
  base_wanted = kwargs['base_model'] != no_model_str and kwargs['login_mode_if_model0']
 
186
  with gr.Row():
187
  col_nochat = gr.Column(visible=not kwargs['chat'])
188
  with col_nochat: # FIXME: for model comparison, and check rest
189
+ text_output_nochat = gr.Textbox(lines=5, label=output_label0).style(show_copy_button=True)
190
  instruction_nochat = gr.Textbox(
191
  lines=kwargs['input_lines'],
192
  label=instruction_label_nochat,
 
220
  submit = gr.Button(value='Submit').style(full_width=False, size='sm')
221
  stop_btn = gr.Button(value="Stop").style(full_width=False, size='sm')
222
  with gr.Row():
223
+ clear = gr.Button("Save, New Conversation")
224
  flag_btn = gr.Button("Flag")
225
  if not kwargs['auto_score']: # FIXME: For checkbox model2
226
  with gr.Column(visible=kwargs['score_model']):
 
239
  score_text2 = gr.Textbox("Response Score2: NA", show_label=False, visible=False)
240
  retry = gr.Button("Regenerate")
241
  undo = gr.Button("Undo")
242
+ with gr.TabItem("Chat"):
243
  with gr.Row():
244
  if 'mbart-' in kwargs['model_lower']:
245
  src_lang = gr.Dropdown(list(languages_covered().keys()),
 
248
  tgt_lang = gr.Dropdown(list(languages_covered().keys()),
249
  value=kwargs['tgt_lang'],
250
  label="Output Language")
251
+ radio_chats = gr.Radio(value=None, label="Saved Chats", visible=True, interactive=True,
252
+ type='value')
253
+ with gr.Row():
254
+ remove_chat_btn = gr.Button(value="Remove Selected Chat", visible=True)
255
+ clear_chat_btn = gr.Button(value="Clear Chat", visible=True)
256
+ chats_row = gr.Row(visible=True).style(equal_height=False)
257
+ with chats_row:
258
+ export_chats_btn = gr.Button(value="Export Chats")
259
+ chats_file = gr.File(interactive=False, label="Download File")
260
+ chats_row2 = gr.Row(visible=True).style(equal_height=False)
261
+ with chats_row2:
262
+ chatsup_output = gr.File(label="Upload Chat File(s)",
263
+ file_types=['.json'],
264
+ file_count='multiple',
265
+ elem_id="warning", elem_classes="feedback")
266
+ add_to_chats_btn = gr.Button("Add File(s) to Chats")
267
+ with gr.TabItem("Data Source"):
268
+ langchain_readme = get_url('https://github.com/h2oai/h2ogpt/blob/main/README_LangChain.md',
269
+ from_str=True)
270
+ gr.HTML(value=f"""LangChain Support Disabled<p>
271
+ Run:<p>
272
+ <code>
273
+ python generate.py --langchain_mode=MyData
274
+ </code>
275
+ <p>
276
+ For more options see: {langchain_readme}""",
277
+ visible=kwargs['langchain_mode'] == 'Disabled', interactive=False)
278
+ data_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled')
279
+ with data_row:
280
+ if is_hf:
281
+ # don't show 'wiki' since it's usually only useful for internal testing at the moment
282
+ no_show_modes = ['Disabled', 'wiki']
283
+ else:
284
+ no_show_modes = ['Disabled']
285
+ allowed_modes = visible_langchain_modes.copy()
286
+ allowed_modes = [x for x in allowed_modes if x in dbs]
287
+ allowed_modes += ['ChatLLM', 'LLM']
288
+ if allow_upload_to_my_data and 'MyData' not in allowed_modes:
289
+ allowed_modes += ['MyData']
290
+ if allow_upload_to_user_data and 'UserData' not in allowed_modes:
291
+ allowed_modes += ['UserData']
292
+ langchain_mode = gr.Radio(
293
+ [x for x in langchain_modes if x in allowed_modes and x not in no_show_modes],
294
+ value=kwargs['langchain_mode'],
295
+ label="Data Source",
296
+ visible=kwargs['langchain_mode'] != 'Disabled')
297
+
298
+ def upload_file(files, x):
299
+ file_paths = [file.name for file in files]
300
+ return files, file_paths
301
+
302
+ upload_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and allow_upload).style(
303
+ equal_height=False)
304
+ # import control
305
+ if kwargs['langchain_mode'] != 'Disabled':
306
+ from gpt_langchain import file_types, have_arxiv
307
+ else:
308
+ have_arxiv = False
309
+ file_types = []
310
+ with upload_row:
311
+ file_types_str = '[' + ' '.join(file_types) + ']'
312
+ fileup_output = gr.File(label=f'Upload {file_types_str}',
313
+ file_types=file_types,
314
+ file_count="multiple",
315
+ elem_id="warning", elem_classes="feedback")
316
+ with gr.Row():
317
+ upload_button = gr.UploadButton("Upload %s" % file_types_str,
318
+ file_types=file_types,
319
+ file_count="multiple",
320
+ visible=False,
321
+ )
322
+ # add not visible until upload something
323
+ with gr.Column():
324
+ add_to_shared_db_btn = gr.Button("Add File(s) to Shared UserData DB",
325
+ visible=allow_upload_to_user_data) # and False)
326
+ add_to_my_db_btn = gr.Button("Add File(s) to Scratch MyData DB",
327
+ visible=allow_upload_to_my_data) # and False)
328
+ url_row = gr.Row(
329
+ visible=kwargs['langchain_mode'] != 'Disabled' and allow_upload and enable_url_upload).style(
330
+ equal_height=False)
331
+ with url_row:
332
+ url_label = 'URL (http/https) or ArXiv:' if have_arxiv else 'URL (http/https)'
333
+ url_text = gr.Textbox(label=url_label, interactive=True)
334
+ with gr.Column():
335
+ url_user_btn = gr.Button(value='Add URL content to Shared UserData DB',
336
+ visible=allow_upload_to_user_data)
337
+ url_my_btn = gr.Button(value='Add URL content to Scratch MyData DB',
338
+ visible=allow_upload_to_my_data)
339
+ text_row = gr.Row(
340
+ visible=kwargs['langchain_mode'] != 'Disabled' and allow_upload and enable_text_upload).style(
341
+ equal_height=False)
342
+ with text_row:
343
+ user_text_text = gr.Textbox(label='Paste Text', interactive=True)
344
+ with gr.Column():
345
+ user_text_user_btn = gr.Button(value='Add Text to Shared UserData DB',
346
+ visible=allow_upload_to_user_data)
347
+ user_text_my_btn = gr.Button(value='Add Text to Scratch MyData DB',
348
+ visible=allow_upload_to_my_data)
349
+ # WIP:
350
+ with gr.Row(visible=False).style(equal_height=False):
351
+ github_textbox = gr.Textbox(label="Github URL")
352
+ with gr.Row(visible=True):
353
+ github_shared_btn = gr.Button(value="Add Github to Shared UserData DB",
354
+ visible=allow_upload_to_user_data)
355
+ github_my_btn = gr.Button(value="Add Github to Scratch MyData DB",
356
+ visible=allow_upload_to_my_data)
357
+ sources_row = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list).style(
358
+ equal_height=False)
359
+ with sources_row:
360
+ sources_text = gr.HTML(label='Sources Added', interactive=False)
361
+ sources_row2 = gr.Row(visible=kwargs['langchain_mode'] != 'Disabled' and enable_sources_list).style(
362
+ equal_height=False)
363
+ with sources_row2:
364
+ get_sources_btn = gr.Button(value="Get Sources List for Selected DB")
365
+ file_source = gr.File(interactive=False, label="Download File with list of Sources")
366
+
367
  with gr.TabItem("Expert"):
368
  with gr.Row():
369
  with gr.Column():
 
392
  )
393
  # FIXME: https://github.com/h2oai/h2ogpt/issues/106
394
  if os.getenv('TESTINGFAIL'):
395
+ max_beams = 8 if not (is_low_mem or is_public) else 1
396
  else:
397
  max_beams = 1
398
  num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
 
505
  with gr.Column():
506
  with gr.Row():
507
  system_btn = gr.Button(value='Get System Info')
508
+ system_text = gr.Textbox(label='System Info', interactive=False).style(
509
+ show_copy_button=True)
510
 
511
  with gr.Row():
512
  zip_btn = gr.Button("Zip")
513
  zip_text = gr.Textbox(label="Zip file name", interactive=False)
514
+ file_output = gr.File(interactive=False)
515
  with gr.Row():
516
  s3up_btn = gr.Button("S3UP")
517
  s3up_text = gr.Textbox(label='S3UP result', interactive=False)
 
528
 
529
  # Get flagged data
530
  zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
531
+ zip_btn.click(zip_data1, inputs=None, outputs=[file_output, zip_text], queue=False,
532
+ api_name='zip_data' if allow_api else None)
533
+ s3up_btn.click(s3up, inputs=zip_text, outputs=s3up_text, queue=False,
534
+ api_name='s3up_data' if allow_api else None)
535
+
536
+ def make_add_visible(x):
537
+ return gr.update(visible=x is not None)
538
+
539
+ def clear_file_list():
540
+ return None
541
+
542
+ def make_invisible():
543
+ return gr.update(visible=False)
544
+
545
+ def make_visible():
546
+ return gr.update(visible=True)
547
+
548
+ # add itself to output to ensure shows working and can't click again
549
+ upload_button.upload(upload_file, inputs=[upload_button, fileup_output],
550
+ outputs=[upload_button, fileup_output], queue=queue,
551
+ api_name='upload_file' if allow_api else None) \
552
+ .then(make_add_visible, fileup_output, add_to_shared_db_btn, queue=queue) \
553
+ .then(make_add_visible, fileup_output, add_to_my_db_btn, queue=queue) \
554
+ .then(make_invisible, outputs=upload_button, queue=queue)
555
+
556
+ # Add to UserData
557
+ update_user_db_func = functools.partial(update_user_db, dbs=dbs, db_type=db_type, langchain_mode='UserData',
558
+ use_openai_embedding=use_openai_embedding,
559
+ hf_embedding_model=hf_embedding_model,
560
+ enable_captions=enable_captions,
561
+ captions_model=captions_model,
562
+ enable_ocr=enable_ocr,
563
+ caption_loader=caption_loader,
564
+ )
565
+
566
+ # note for update_user_db_func output is ignored for db
567
+ add_to_shared_db_btn.click(update_user_db_func,
568
+ inputs=[fileup_output, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
569
+ outputs=[add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
570
+ api_name='add_to_shared' if allow_api else None) \
571
+ .then(clear_file_list, outputs=fileup_output, queue=queue)
572
+
573
+ # .then(make_invisible, outputs=add_to_shared_db_btn, queue=queue)
574
+ # .then(make_visible, outputs=upload_button, queue=queue)
575
+
576
+ def clear_textbox():
577
+ return gr.Textbox.update(value='')
578
+
579
+ update_user_db_url_func = functools.partial(update_user_db_func, is_url=True)
580
+ url_user_btn.click(update_user_db_url_func,
581
+ inputs=[url_text, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
582
+ outputs=[add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
583
+ api_name='add_url_to_shared' if allow_api else None) \
584
+ .then(clear_textbox, outputs=url_text, queue=queue)
585
+
586
+ update_user_db_txt_func = functools.partial(update_user_db_func, is_txt=True)
587
+ user_text_user_btn.click(update_user_db_txt_func,
588
+ inputs=[user_text_text, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
589
+ outputs=[add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
590
+ api_name='add_text_to_shared' if allow_api else None) \
591
+ .then(clear_textbox, outputs=user_text_text, queue=queue)
592
+
593
+ # Add to MyData
594
+ update_my_db_func = functools.partial(update_user_db, dbs=dbs, db_type=db_type, langchain_mode='MyData',
595
+ use_openai_embedding=use_openai_embedding,
596
+ hf_embedding_model=hf_embedding_model,
597
+ enable_captions=enable_captions,
598
+ captions_model=captions_model,
599
+ enable_ocr=enable_ocr,
600
+ caption_loader=caption_loader,
601
+ )
602
+
603
+ add_to_my_db_btn.click(update_my_db_func,
604
+ inputs=[fileup_output, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
605
+ outputs=[my_db_state, add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
606
+ api_name='add_to_my' if allow_api else None) \
607
+ .then(clear_file_list, outputs=fileup_output, queue=queue)
608
+ # .then(make_invisible, outputs=add_to_shared_db_btn, queue=queue)
609
+ # .then(make_visible, outputs=upload_button, queue=queue)
610
+
611
+ update_my_db_url_func = functools.partial(update_my_db_func, is_url=True)
612
+ url_my_btn.click(update_my_db_url_func,
613
+ inputs=[url_text, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
614
+ outputs=[my_db_state, add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
615
+ api_name='add_url_to_my' if allow_api else None) \
616
+ .then(clear_textbox, outputs=url_text, queue=queue)
617
+
618
+ update_my_db_txt_func = functools.partial(update_my_db_func, is_txt=True)
619
+ user_text_my_btn.click(update_my_db_txt_func,
620
+ inputs=[user_text_text, my_db_state, add_to_shared_db_btn, add_to_my_db_btn],
621
+ outputs=[my_db_state, add_to_shared_db_btn, add_to_my_db_btn, sources_text], queue=queue,
622
+ api_name='add_txt_to_my' if allow_api else None) \
623
+ .then(clear_textbox, outputs=user_text_text, queue=queue)
624
+
625
+ get_sources1 = functools.partial(get_sources, dbs=dbs)
626
+ get_sources_btn.click(get_sources1, inputs=[my_db_state, langchain_mode], outputs=file_source, queue=queue,
627
+ api_name='get_sources' if allow_api else None)
628
 
629
  def check_admin_pass(x):
630
  return gr.update(visible=x == admin_pass)
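The handlers above follow Gradio's event-chaining pattern: each .click()/.upload() returns a dependency whose .then() runs a follow-up step, and visibility is toggled by returning gr.update(). A minimal, self-contained sketch of that pattern (component names here are illustrative, not taken from this app):

import gradio as gr

def ingest(f):
    # stand-in for upload_file(): report the uploaded temp-file name
    return "no file" if f is None else f.name

with gr.Blocks() as demo:
    upload = gr.UploadButton("Upload")
    status = gr.Textbox(label="Status")
    add_btn = gr.Button("Add to DB", visible=False)
    # chain: ingest, then reveal the follow-up button, then hide the upload button
    upload.upload(ingest, inputs=upload, outputs=status) \
        .then(lambda: gr.update(visible=True), outputs=add_btn) \
        .then(lambda: gr.update(visible=False), outputs=upload)

demo.launch()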
 
814
  """
815
  # don't deepcopy, can contain model itself
816
  args_list = list(args).copy()
817
+ model_state1 = args_list[-3]
818
+ my_db_state1 = args_list[-2]
819
+ history = args_list[-1]
820
+
821
+ args_list = args_list[:-3] # only keep rest needed for evaluate()
822
+ langchain_mode1 = args_list[eval_func_param_names.index('langchain_mode')]
823
  if retry and history:
824
  history.pop()
825
+ if not args_list[eval_func_param_names.index('do_sample')]:
826
+ # if sampling was off, retrying is pointless unless we switch to sampling
827
+ args_list[eval_func_param_names.index('do_sample')] = True
828
  if not history:
829
  print("No history", flush=True)
830
  history = [['', None]]
831
  yield history, ''
832
  return
833
  # ensure output will be unique to models
834
+ _, _, _, max_prompt_length = get_cutoffs(is_low_mem, for_context=True)
835
  history = copy.deepcopy(history)
836
  instruction1 = history[-1][0]
837
  context1 = ''
838
+ if max_prompt_length is not None and langchain_mode1 not in ['LLM']:
839
+ prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
840
+ chat1 = args_list[eval_func_param_names.index('chat')]
 
 
841
  context1 = ''
842
+ # - 1 below because current instruction already in history from user()
843
+ for histi in range(0, len(history) - 1):
844
  data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
845
  prompt, pre_response, terminate_response, chat_sep = generate_prompt(data_point, prompt_type1,
846
  chat1, reduced=True)
847
+ # markdown -> back to plain text; may not matter much if the model is trained well enough
848
+ if not kwargs['keep_sources_in_context']:
849
+ from gpt_langchain import source_prefix, source_postfix
850
+ import re
851
+ prompt = re.sub(f'{re.escape(source_prefix)}.*?{re.escape(source_postfix)}', '', prompt,
852
+ flags=re.DOTALL)
853
+ if prompt.endswith('\n<p>'):
854
+ prompt = prompt[:-4]
855
  prompt = prompt.replace('<br>', chat_sep)
856
+ if not prompt.endswith(chat_sep):
857
+ prompt += chat_sep
858
+ # most recent first; add older history only if it still fits
859
+ # only include desired chat history
860
+ if len(prompt + context1) > max_prompt_length:
861
+ break
862
+ context1 = prompt + context1
863
 
864
  _, pre_response, terminate_response, chat_sep = generate_prompt({}, prompt_type1, chat1,
865
  reduced=True)
866
  if context1 and not context1.endswith(chat_sep):
867
  context1 += chat_sep # ensure if terminates abruptly, then human continues on next line
868
  args_list[0] = instruction1 # override original instruction with history from user
869
+ args_list[2] = context1
 
 
870
  if model_state1[0] is None or model_state1[0] == no_model_str:
871
  history = [['', None]]
872
  yield history, ''
873
  return
 
874
  fun1 = partial(evaluate,
875
  model_state1,
876
+ my_db_state1,
877
  **kwargs_evaluate)
878
  try:
879
  for output in fun1(*tuple(args_list)):
 
907
  outputs=text_output,
908
  )
909
  bot_args = dict(fn=bot,
910
+ inputs=inputs_list + [model_state, my_db_state] + [text_output],
911
  outputs=[text_output, exception_text],
912
  )
913
  retry_bot_args = dict(fn=functools.partial(bot, retry=True),
914
+ inputs=inputs_list + [model_state, my_db_state] + [text_output],
915
  outputs=[text_output, exception_text],
916
  )
917
  undo_user_args = dict(fn=functools.partial(user, undo=True),
 
925
  outputs=text_output2,
926
  )
927
  bot_args2 = dict(fn=bot,
928
+ inputs=inputs_list + [model_state2, my_db_state] + [text_output2],
929
  outputs=[text_output2, exception_text],
930
  )
931
  retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
932
+ inputs=inputs_list + [model_state2, my_db_state] + [text_output2],
933
  outputs=[text_output2, exception_text],
934
  )
935
  undo_user_args2 = dict(fn=functools.partial(user, undo=True),
 
956
  .then(clear_instruct, None, iinput)
957
  submit_event1d = submit_event1c.then(**bot_args, api_name='instruction_bot' if allow_api else None,
958
  queue=queue)
959
+ submit_event1e = submit_event1d.then(**score_args_submit,
960
+ api_name='instruction_bot_score' if allow_api else None,
961
  queue=queue)
962
  submit_event1f = submit_event1e.then(**bot_args2, api_name='instruction_bot2' if allow_api else None,
963
  queue=queue)
 
998
  .then(**score_args_submit, api_name='undo_score' if allow_api else None) \
999
  .then(**score_args2_submit, api_name='undo_score2' if allow_api else None)
1000
 
1001
+ # MANAGE CHATS
1002
+ def dedup(short_chat, short_chats):
1003
+ if short_chat not in short_chats:
1004
+ return short_chat
1005
+ for i in range(1, 1000):
1006
+ short_chat_try = short_chat + "_" + str(i)
1007
+ if short_chat_try not in short_chats:
1008
+ return short_chat_try
1009
+ # fallback and hope for best
1010
+ short_chat = short_chat + "_" + str(random.random())
1011
+ return short_chat
1012
+
1013
+ def get_short_chat(x, short_chats, short_len=20, words=4):
1014
+ if x and len(x[0]) == 2 and x[0][0] is not None:
1015
+ short_chat = ' '.join(x[0][0][:short_len].split(' ')[:words]).strip()
1016
+ short_chat = dedup(short_chat, short_chats)
1017
+ else:
1018
+ short_chat = None
1019
+ return short_chat
1020
+
1021
+ def is_chat_same(x, y):
1022
+ # <p> tags etc. are added in chat; strip some of that to avoid duplicate entries when starting a new conversation
1023
+ is_same = True
1024
+ # length of conversation has to be same
1025
+ if len(x) != len(y):
1026
+ return False
1027
+ for stepx, stepy in zip(x, y):
1028
+ if len(stepx) != len(stepy):
1029
+ # something off with a conversation
1030
+ return False
1031
+ if len(stepx) != 2:
1032
+ # something off
1033
+ return False
1034
+ if len(stepy) != 2:
1035
+ # something off
1036
+ return False
1037
+ questionx = stepx[0].replace('<p>', '').replace('</p>', '')
1038
+ answerx = stepx[1].replace('<p>', '').replace('</p>', '')
1039
+
1040
+ questiony = stepy[0].replace('<p>', '').replace('</p>', '')
1041
+ answery = stepy[1].replace('<p>', '').replace('</p>', '')
1042
+
1043
+ if questionx != questiony or answerx != answery:
1044
+ return False
1045
+ return is_same
1046
+
1047
+ def save_chat(chat1, chat2, chat_state1):
1048
+ short_chats = list(chat_state1.keys())
1049
+ for chati in [chat1, chat2]:
1050
+ if chati and len(chati) > 0 and len(chati[0]) == 2 and chati[0][1] is not None:
1051
+ short_chat = get_short_chat(chati, short_chats)
1052
+ if short_chat:
1053
+ already_exists = any([is_chat_same(chati, x) for x in chat_state1.values()])
1054
+ if not already_exists:
1055
+ chat_state1[short_chat] = chati
1056
+ return chat_state1
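save_chat() keys each saved conversation by a short label taken from its first user message, de-duplicating labels via dedup() and skipping exact duplicates via is_chat_same(). A small illustrative example of the expected behavior (chat contents are made up):

# each chat is a list of [user, bot] message pairs
chat_a = [["Tell me about Python lists", "They are mutable sequences."]]
chat_b = [["Tell me about Python dicts", "They map keys to values."]]
state = {}
state = save_chat(chat_a, None, state)
state = save_chat(chat_b, None, state)
# both share the same first 20 characters, so the second label gets a numeric suffix
print(list(state.keys()))  # ['Tell me about Python', 'Tell me about Python_1']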
1057
+
1058
+ def update_radio_chats(chat_state1):
1059
+ return gr.update(choices=list(chat_state1.keys()), value=None)
1060
+
1061
+ def deselect_radio_chats():
1062
+ return gr.update(value=None)
1063
+
1064
+ def switch_chat(chat_key, chat_state1):
1065
+ chosen_chat = chat_state1[chat_key]
1066
+ return chosen_chat, chosen_chat
1067
+
1068
+ radio_chats.input(switch_chat, inputs=[radio_chats, chat_state], outputs=[text_output, text_output2])
1069
+
1070
+ def remove_chat(chat_key, chat_state1):
1071
+ chat_state1.pop(chat_key, None)
1072
+ return chat_state1
1073
+
1074
+ remove_chat_btn.click(remove_chat, inputs=[radio_chats, chat_state], outputs=chat_state) \
1075
+ .then(update_radio_chats, inputs=chat_state, outputs=radio_chats)
1076
+
1077
+ def get_chats1(chat_state1):
1078
+ base = 'chats'
1079
+ makedirs(base, exist_ok=True)
1080
+ filename = os.path.join(base, 'chats_%s.json' % str(uuid.uuid4()))
1081
+ with open(filename, "wt") as f:
1082
+ f.write(json.dumps(chat_state1, indent=2))
1083
+ return filename
1084
+
1085
+ export_chats_btn.click(get_chats1, inputs=chat_state, outputs=chats_file, queue=False,
1086
+ api_name='export_chats' if allow_api else None)
1087
+
1088
+ def add_chats_from_file(file, chat_state1, add_btn):
1089
+ if isinstance(file, str):
1090
+ files = [file]
1091
+ else:
1092
+ files = file
1093
+ for file1 in files:
1094
+ try:
1095
+ if hasattr(file1, 'name'):
1096
+ file1 = file1.name
1097
+ with open(file1, "rt") as f:
1098
+ new_chats = json.loads(f.read())
1099
+ for chat1_k, chat1_v in new_chats.items():
1100
+ # ignore chat1_k, regenerate and de-dup to avoid loss
1101
+ chat_state1 = save_chat(chat1_v, None, chat_state1)
1102
+ except BaseException as e:
1103
+ print("Add chats exception: %s" % str(e), flush=True)
1104
+ return chat_state1, add_btn
1105
+
1106
+ # note: the button value is just passed through to the output unchanged
1107
+ add_to_chats_btn.click(add_chats_from_file,
1108
+ inputs=[chatsup_output, chat_state, add_to_chats_btn],
1109
+ outputs=[chat_state, add_to_my_db_btn], queue=False,
1110
+ api_name='add_to_chats' if allow_api else None) \
1111
+ .then(clear_file_list, outputs=chatsup_output, queue=False) \
1112
+ .then(update_radio_chats, inputs=chat_state, outputs=radio_chats, queue=False)
1113
+
1114
+ clear_chat_btn.click(lambda: None, None, text_output, queue=False, api_name='clear' if allow_api else None) \
1115
+ .then(lambda: None, None, text_output2, queue=False, api_name='clear2' if allow_api else None) \
1116
+ .then(deselect_radio_chats, inputs=None, outputs=radio_chats, queue=False)
1117
+
1118
  # does both models
1119
+ clear.click(save_chat, inputs=[text_output, text_output2, chat_state], outputs=chat_state,
1120
+ api_name='save_chat' if allow_api else None) \
1121
+ .then(update_radio_chats, inputs=chat_state, outputs=radio_chats,
1122
+ api_name='update_chats' if allow_api else None) \
1123
+ .then(lambda: None, None, text_output, queue=False, api_name='clearB' if allow_api else None) \
1124
+ .then(lambda: None, None, text_output2, queue=False, api_name='clearB2' if allow_api else None)
1125
  # NOTE: clear of instruction/iinput for nochat has to come after score,
1126
  # because score for nochat consumes actual textbox, while chat consumes chat history filled by user()
1127
+ submit_event_nochat = submit_nochat.click(fun,
1128
+ inputs=[model_state, my_db_state] + inputs_list,
1129
  outputs=text_output_nochat,
1130
  queue=queue,
1131
  api_name='submit_nochat' if allow_api else None) \
 
1227
  new_state = [list0[0] + [x]]
1228
  new_options = [*new_state[0]]
1229
  return gr.Dropdown.update(value=x, choices=new_options), \
1230
+ gr.Dropdown.update(value=x, choices=new_options), \
1231
+ '', new_state
1232
 
1233
  add_model_event = add_model_button.click(fn=dropdown_model_list,
1234
  inputs=[model_options_state, new_model],
 
1242
  x1 = x if model_used1 == no_model_str else lora_used1
1243
  x2 = x if model_used2 == no_model_str else lora_used2
1244
  return gr.Dropdown.update(value=x1, choices=new_options), \
1245
+ gr.Dropdown.update(value=x2, choices=new_options), \
1246
+ '', new_state
1247
 
1248
  add_lora_event = add_lora_button.click(fn=dropdown_lora_list,
1249
  inputs=[lora_options_state, new_lora, model_used, lora_used, model_used2,
 
1301
 
1302
  scheduler = BackgroundScheduler()
1303
  scheduler.add_job(func=clear_torch_cache, trigger="interval", seconds=20)
1304
+ if is_public and \
1305
+ kwargs['base_model'] not in ['gptj', 'llama']:
1306
+ # FIXME: disabled for gptj/llama, since langchain or gpt4all modify print itself
1307
+ # FIXME: and any multi-threaded/async print will enter model output!
1308
  scheduler.add_job(func=ping, trigger="interval", seconds=60)
1309
  scheduler.start()
1310
 
1311
+ # import control
1312
+ if kwargs['langchain_mode'] == 'Disabled' and \
1313
+ os.environ.get("TEST_LANGCHAIN_IMPORT") and \
1314
+ kwargs['base_model'] not in ['gptj', 'llama']:
1315
+ assert 'gpt_langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1316
+ assert 'langchain' not in sys.modules, "Dev bug, import of langchain when should not have"
1317
+
1318
  demo.launch(share=kwargs['share'], server_name="0.0.0.0", show_error=True,
1319
  favicon_path=favicon_path, prevent_thread_lock=True,
1320
  auth=kwargs['auth'])
 
1323
  demo.block_thread()
1324
 
1325
 
1326
+ input_args_list = ['model_state', 'my_db_state']
 
 
1327
 
1328
 
1329
  def get_inputs_list(inputs_dict, model_lower):
 
1339
  if k == 'kwargs':
1340
  continue
1341
  if k in input_args_list + inputs_kwargs_list:
1342
+ # these are added at use time for args or partial for kwargs, not taken as input
1343
  continue
1344
  if 'mbart-' not in model_lower and k in ['src_lang', 'tgt_lang']:
1345
  continue
1346
  inputs_list.append(inputs_dict[k])
1347
  return inputs_list
1348
+
1349
+
1350
+ def get_sources(db1, langchain_mode, dbs=None):
1351
+ if langchain_mode in ['ChatLLM', 'LLM']:
1352
+ source_files_added = "NA"
1353
+ elif langchain_mode in ['wiki_full']:
1354
+ source_files_added = "Not showing wiki_full, takes about 20 seconds and makes 4MB file." \
1355
+ " Ask [email protected] for file if required."
1356
+ elif langchain_mode == 'MyData' and len(db1) > 0 and db1[0] is not None:
1357
+ db_get = db1[0].get()
1358
+ source_files_added = '\n'.join(sorted(set([x['source'] for x in db_get['metadatas']])))
1359
+ elif langchain_mode in dbs and dbs[langchain_mode] is not None:
1360
+ db1 = dbs[langchain_mode]
1361
+ db_get = db1.get()
1362
+ source_files_added = '\n'.join(sorted(set([x['source'] for x in db_get['metadatas']])))
1363
+ else:
1364
+ source_files_added = "None"
1365
+ sources_file = 'sources_%s_%s' % (langchain_mode, str(uuid.uuid4()))
1366
+ with open(sources_file, "wt") as f:
1367
+ f.write(source_files_added)
1368
+ return sources_file
1369
+
1370
+
1371
+ def update_user_db(file, db1, x, y, *args, dbs=None, langchain_mode='UserData', **kwargs):
1372
+ try:
1373
+ return _update_user_db(file, db1, x, y, *args, dbs=dbs, langchain_mode=langchain_mode, **kwargs)
1374
+ except BaseException as e:
1375
+ print(traceback.format_exc(), flush=True)
1376
+ # gradio has issues if except, so fail semi-gracefully, else would hang forever in processing textbox
1377
+ ex_str = "Exception: %s" % str(e)
1378
+ source_files_added = """\
1379
+ <html>
1380
+ <body>
1381
+ <p>
1382
+ Sources: <br>
1383
+ </p>
1384
+ <div style="overflow-y: auto;height:400px">
1385
+ {0}
1386
+ </div>
1387
+ </body>
1388
+ </html>
1389
+ """.format(ex_str)
1390
+ if langchain_mode == 'MyData':
1391
+ return db1, x, y, source_files_added
1392
+ else:
1393
+ return x, y, source_files_added
1394
+
1395
+
1396
+ def _update_user_db(file, db1, x, y, dbs=None, db_type=None, langchain_mode='UserData', use_openai_embedding=False,
1397
+ hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
1398
+ caption_loader=None,
1399
+ enable_captions=True,
1400
+ captions_model="Salesforce/blip-image-captioning-base",
1401
+ enable_ocr=False,
1402
+ verbose=False,
1403
+ chunk=True, chunk_size=512, is_url=False, is_txt=False):
1404
+ assert isinstance(dbs, dict), "Wrong type for dbs: %s" % str(type(dbs))
1405
+ assert db_type in ['faiss', 'chroma'], "db_type %s not supported" % db_type
1406
+ from gpt_langchain import add_to_db, get_db, path_to_docs
1407
+ # handle case of list of temp buffer
1408
+ if isinstance(file, list) and len(file) > 0 and hasattr(file[0], 'name'):
1409
+ file = [x.name for x in file]
1410
+ # handle single file of temp buffer
1411
+ if hasattr(file, 'name'):
1412
+ file = file.name
1413
+ if verbose:
1414
+ print("Adding %s" % file, flush=True)
1415
+ sources = path_to_docs(file if not is_url and not is_txt else None,
1416
+ verbose=verbose, chunk=chunk, chunk_size=chunk_size,
1417
+ url=file if is_url else None,
1418
+ text=file if is_txt else None,
1419
+ enable_captions=enable_captions,
1420
+ captions_model=captions_model,
1421
+ enable_ocr=enable_ocr,
1422
+ caption_loader=caption_loader,
1423
+ )
1424
+ exceptions = [x for x in sources if x.metadata.get('exception')]
1425
+ sources = [x for x in sources if 'exception' not in x.metadata]
1426
+
1427
+ with filelock.FileLock("db_%s.lock" % langchain_mode.replace(' ', '_')):
1428
+ if langchain_mode == 'MyData':
1429
+ if db1[0] is not None:
1430
+ # then add
1431
+ add_to_db(db1[0], sources, db_type=db_type)
1432
+ else:
1433
+ assert len(db1) == 2 and db1[1] is None, "Bad MyData db: %s" % db1
1434
+ # then create
1435
+ # assign fresh hash for this user session, so not shared
1436
+ # if the hash were added to the original state and never changed, the db would be shared across all users
1437
+ db1[1] = str(uuid.uuid4())
1438
+ persist_directory = os.path.join(scratch_base_dir, 'db_dir_%s_%s' % (langchain_mode, db1[1]))
1439
+ db1[0] = get_db(sources, use_openai_embedding=use_openai_embedding,
1440
+ db_type=db_type,
1441
+ persist_directory=persist_directory,
1442
+ langchain_mode=langchain_mode,
1443
+ hf_embedding_model=hf_embedding_model)
1444
+ if db1[0] is None:
1445
+ db1[1] = None
1446
+ source_files_added = get_source_files(db1[0], exceptions=exceptions)
1447
+ return db1, x, y, source_files_added
1448
+ else:
1449
+ persist_directory = 'db_dir_%s' % langchain_mode
1450
+ if langchain_mode in dbs and dbs[langchain_mode] is not None:
1451
+ # then add
1452
+ add_to_db(dbs[langchain_mode], sources, db_type=db_type)
1453
+ else:
1454
+ # then create
1455
+ db = get_db(sources, use_openai_embedding=use_openai_embedding,
1456
+ db_type=db_type,
1457
+ persist_directory=persist_directory,
1458
+ langchain_mode=langchain_mode,
1459
+ hf_embedding_model=hf_embedding_model)
1460
+ dbs[langchain_mode] = db
1461
+ # NOTE we do not return db, because function call always same code path
1462
+ # return dbs[langchain_mode], x, y
1463
+ # db in this code path is updated in place
1464
+ source_files_added = get_source_files(dbs[langchain_mode], exceptions=exceptions)
1465
+ return x, y, source_files_added
1466
+
1467
+
1468
+ def get_source_files(db, exceptions=None):
1469
+ if exceptions is None:
1470
+ exceptions = []
1471
+
1472
+ if db is not None:
1473
+ metadatas = db.get()['metadatas']
1474
+ else:
1475
+ metadatas = []
1476
+
1477
+ # below automatically de-dups
1478
+ from gpt_langchain import get_url
1479
+ small_dict = {get_url(x['source'], from_str=True, short_name=True): get_short_name(x.get('head')) for x in
1480
+ metadatas}
1481
+ # if small_dict is empty dict, that's ok
1482
+ df = pd.DataFrame(small_dict.items(), columns=['source', 'head'])
1483
+ df.index = df.index + 1
1484
+ df.index.name = 'index'
1485
+ source_files_added = tabulate.tabulate(df, headers='keys', tablefmt='unsafehtml')
1486
+
1487
+ if exceptions:
1488
+ exception_metadatas = [x.metadata for x in exceptions]
1489
+ small_dict = {get_url(x['source'], from_str=True, short_name=True): get_short_name(x.get('exception')) for x in
1490
+ exception_metadatas}
1491
+ # if small_dict is empty dict, that's ok
1492
+ df = pd.DataFrame(small_dict.items(), columns=['source', 'exception'])
1493
+ df.index = df.index + 1
1494
+ df.index.name = 'index'
1495
+ exceptions_html = tabulate.tabulate(df, headers='keys', tablefmt='unsafehtml')
1496
+ else:
1497
+ exceptions_html = ''
1498
+
1499
+ if metadatas and exceptions:
1500
+ source_files_added = """\
1501
+ <html>
1502
+ <body>
1503
+ <p>
1504
+ Sources: <br>
1505
+ </p>
1506
+ <div style="overflow-y: auto;height:400px">
1507
+ {0}
1508
+ {1}
1509
+ </div>
1510
+ </body>
1511
+ </html>
1512
+ """.format(source_files_added, exceptions_html)
1513
+ elif metadatas:
1514
+ source_files_added = """\
1515
+ <html>
1516
+ <body>
1517
+ <p>
1518
+ Sources: <br>
1519
+ </p>
1520
+ <div style="overflow-y: auto;height:400px">
1521
+ {0}
1522
+ </div>
1523
+ </body>
1524
+ </html>
1525
+ """.format(source_files_added)
1526
+ elif exceptions_html:
1527
+ source_files_added = """\
1528
+ <html>
1529
+ <body>
1530
+ <p>
1531
+ Exceptions: <br>
1532
+ </p>
1533
+ <div style="overflow-y: auto;height:400px">
1534
+ {0}
1535
+ </div>
1536
+ </body>
1537
+ </html>
1538
+ """.format(exceptions_html)
1539
+ else:
1540
+ source_files_added = ""
1541
+
1542
+ return source_files_added
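get_source_files() renders db metadata into an HTML table via pandas and tabulate before wrapping it in the scrollable div above. A standalone sketch of that rendering step (the metadata rows are made up):

import pandas as pd
import tabulate

small_dict = {'user_paste/abc123': 'What is Driverless AI?...',
              'https://h2o.ai': 'H2O.ai homepage'}
df = pd.DataFrame(small_dict.items(), columns=['source', 'head'])
df.index = df.index + 1
df.index.name = 'index'
html_table = tabulate.tabulate(df, headers='keys', tablefmt='unsafehtml')
print(html_table)  # an HTML <table> ready to embed in the Sources pane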
gradio_themes.py CHANGED
@@ -1,6 +1,7 @@
1
  from __future__ import annotations
2
  from gradio.themes.soft import Soft
3
- from gradio.themes.utils import Color, colors, sizes
 
4
 
5
  h2o_yellow = Color(
6
  name="yellow",
@@ -74,6 +75,7 @@ class H2oTheme(Soft):
74
  body_background_fill_dark="*neutral_900",
75
  background_fill_primary_dark="*block_background_fill",
76
  block_radius="0 0 8px 8px",
 
77
  )
78
 
79
 
 
1
  from __future__ import annotations
2
  from gradio.themes.soft import Soft
3
+ from gradio.themes import Color
4
+ from gradio.themes.utils import colors, sizes
5
 
6
  h2o_yellow = Color(
7
  name="yellow",
 
75
  body_background_fill_dark="*neutral_900",
76
  background_fill_primary_dark="*block_background_fill",
77
  block_radius="0 0 8px 8px",
78
+ checkbox_label_text_color_selected_dark='#000000',
79
  )
80
 
81
 
h2oai_pipeline.py ADDED
@@ -0,0 +1,54 @@
1
+ from transformers import TextGenerationPipeline
2
+ from transformers.pipelines.text_generation import ReturnType
3
+
4
+ from stopping import get_stopping
5
+
6
+ prompt_type = "human_bot"
7
+ human = "<human>:"
8
+ bot = "<bot>:"
9
+
10
+ # human-bot interaction like OIG dataset
11
+ prompt = """{human} {instruction}
12
+ {bot}""".format(
13
+ human=human,
14
+ instruction="{instruction}",
15
+ bot=bot,
16
+ )
17
+
18
+
19
+ class H2OTextGenerationPipeline(TextGenerationPipeline):
20
+ def __init__(self, *args, use_prompter=False, debug=False, chat=False, stream_output=False,
21
+ sanitize_bot_response=True, **kwargs):
22
+ super().__init__(*args, **kwargs)
23
+ self.use_prompter = use_prompter
24
+ self.prompt_text = None
25
+ if self.use_prompter:
26
+ from prompter import Prompter
27
+ self.prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
28
+ else:
29
+ self.prompter = None
30
+ self.sanitize_bot_response = sanitize_bot_response
31
+
32
+ def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
33
+ prompt_text = prompt.format(instruction=prompt_text)
34
+ self.prompt_text = prompt_text
35
+ return super().preprocess(prompt_text, prefix=prefix, handle_long_generation=handle_long_generation,
36
+ **generate_kwargs)
37
+
38
+ def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, clean_up_tokenization_spaces=True):
39
+ records = super().postprocess(model_outputs, return_type=return_type,
40
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces)
41
+ for rec in records:
42
+ if self.use_prompter:
43
+ outputs = rec['generated_text']
44
+ outputs = self.prompter.get_response(outputs, prompt=self.prompt_text,
45
+ sanitize_bot_response=self.sanitize_bot_response)
46
+ else:
47
+ outputs = rec['generated_text'].split(bot)[1].strip().split(human)[0].strip()
48
+ rec['generated_text'] = outputs
49
+ return records
50
+
51
+ def _forward(self, model_inputs, **generate_kwargs):
52
+ stopping_criteria = get_stopping(prompt_type, self.tokenizer, self.device, human=human, bot=bot)
53
+ generate_kwargs['stopping_criteria'] = stopping_criteria
54
+ return super()._forward(model_inputs, **generate_kwargs)
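A rough usage sketch for H2OTextGenerationPipeline; the checkpoint name and generation settings below are illustrative, not prescribed by this file:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "h2oai/h2ogpt-oig-oasst1-512-6_9b"  # illustrative h2oGPT checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer, use_prompter=True)
# preprocess() wraps the instruction in the <human>:/<bot>: prompt before generation
res = pipe("Why is drinking water so healthy?", max_new_tokens=64)
print(res[0]['generated_text'])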
loaders.py ADDED
@@ -0,0 +1,50 @@
1
+ def get_loaders(llama_type, model_name, reward_type):
2
+ # NOTE: Some models need specific new prompt_type
3
+ # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT"
4
+ if llama_type:
5
+ from transformers import LlamaForCausalLM, LlamaTokenizer
6
+ model_loader = LlamaForCausalLM
7
+ tokenizer_loader = LlamaTokenizer
8
+ elif 'distilgpt2' in model_name.lower():
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+ return AutoModelForCausalLM, AutoTokenizer
11
+ elif 'gpt2' in model_name.lower():
12
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
13
+ return GPT2LMHeadModel, GPT2Tokenizer
14
+ elif 'mbart-' in model_name.lower():
15
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
16
+ return MBartForConditionalGeneration, MBart50TokenizerFast
17
+ elif 't5' == model_name.lower() or \
18
+ 't5-' in model_name.lower() or \
19
+ 'flan-' in model_name.lower():
20
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
21
+ return T5ForConditionalGeneration, AutoTokenizer
22
+ elif 'bigbird' in model_name:
23
+ from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer
24
+ return BigBirdPegasusForConditionalGeneration, AutoTokenizer
25
+ elif 'bart-large-cnn-samsum' in model_name or 'flan-t5-base-samsum' in model_name:
26
+ from transformers import pipeline
27
+ return pipeline, "summarization"
28
+ elif reward_type or 'OpenAssistant/reward-model'.lower() in model_name.lower():
29
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
30
+ return AutoModelForSequenceClassification, AutoTokenizer
31
+ else:
32
+ from transformers import AutoTokenizer, AutoModelForCausalLM
33
+ model_loader = AutoModelForCausalLM
34
+ tokenizer_loader = AutoTokenizer
35
+ return model_loader, tokenizer_loader
36
+
37
+
38
+ def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
39
+ tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
40
+ local_files_only=local_files_only,
41
+ resume_download=resume_download,
42
+ use_auth_token=use_auth_token)
43
+
44
+ tokenizer.pad_token_id = 0 # different from the eos token
45
+ # when generating, we will use the logits of right-most token to predict the next token
46
+ # so the padding should be on the left,
47
+ # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
48
+ tokenizer.padding_side = "left" # Allow batched inference
49
+
50
+ return tokenizer
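A hedged usage sketch for these loaders; the model name is illustrative and any non-default download flags would be passed the same way:

model_name = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'  # illustrative
model_loader, tokenizer_loader = get_loaders(llama_type=False, model_name=model_name, reward_type=False)
tokenizer = get_tokenizer(tokenizer_loader, model_name,
                          local_files_only=False, resume_download=True, use_auth_token=False)
model = model_loader.from_pretrained(model_name)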
prompter.py CHANGED
@@ -1,4 +1,355 @@
1
- from finetune import generate_prompt
 
 
2
 
3
 
4
  class Prompter(object):
@@ -13,6 +364,12 @@ class Prompter(object):
13
  self.stream_output = stream_output
14
  self.repeat_penalty = repeat_penalty
15
  self.allowed_repeat_line_length = allowed_repeat_line_length
16
 
17
  def generate_prompt(self, data_point):
18
  reduced = False
@@ -55,6 +412,18 @@ class Prompter(object):
55
  for oi, output in enumerate(outputs):
56
  if self.prompt_type in [0, '0', 'plain']:
57
  output = clean_response(output)
58
  else:
59
  # find first instance of pre_response
60
  # prompt sometimes has odd characters, that mutate length,
 
1
+ import time
2
+ from enum import Enum
3
+
4
+
5
+ class PromptType(Enum):
6
+ plain = 0
7
+ instruct = 1
8
+ quality = 2
9
+ human_bot = 3
10
+ dai_faq = 4
11
+ summarize = 5
12
+ simple_instruct = 6
13
+ instruct_vicuna = 7
14
+ instruct_with_end = 8
15
+ human_bot_orig = 9
16
+ prompt_answer = 10
17
+ open_assistant = 11
18
+ wizard_lm = 12
19
+ wizard_mega = 13
20
+
21
+
22
+ prompt_type_to_model_name = {
23
+ 'plain': [
24
+ 'EleutherAI/gpt-j-6B',
25
+ 'EleutherAI/pythia-6.9b',
26
+ 'EleutherAI/pythia-12b',
27
+ 'EleutherAI/pythia-12b-deduped',
28
+ 'EleutherAI/gpt-neox-20b',
29
+ 'decapoda-research/llama-7b-hf',
30
+ 'decapoda-research/llama-13b-hf',
31
+ 'decapoda-research/llama-30b-hf',
32
+ 'decapoda-research/llama-65b-hf',
33
+ 'facebook/mbart-large-50-many-to-many-mmt',
34
+ 'philschmid/bart-large-cnn-samsum',
35
+ 'philschmid/flan-t5-base-samsum',
36
+ 'gpt2',
37
+ 'distilgpt2',
38
+ 'mosaicml/mpt-7b-storywriter',
39
+ 'mosaicml/mpt-7b-instruct', # internal code handles instruct
40
+ 'mosaicml/mpt-7b-chat', # NC, internal code handles instruct
41
+ 'gptj', # internally handles prompting
42
+ 'llama', # internally handles prompting
43
+ ],
44
+ 'prompt_answer': [
45
+ 'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
46
+ 'h2oai/h2ogpt-gm-oasst1-en-1024-12b',
47
+ 'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
48
+ 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt',
49
+ 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
50
+ ],
51
+ 'instruct': [],
52
+ 'instruct_with_end': ['databricks/dolly-v2-12b'],
53
+ 'quality': [],
54
+ 'human_bot': [
55
+ 'h2oai/h2ogpt-oasst1-512-12b',
56
+ 'h2oai/h2ogpt-oasst1-512-20b',
57
+ 'h2oai/h2ogpt-oig-oasst1-256-6_9b',
58
+ 'h2oai/h2ogpt-oig-oasst1-512-6_9b',
59
+ 'h2oai/h2ogpt-research-oasst1-512-30b', # private
60
+ ],
61
+ 'dai_faq': [],
62
+ 'summarize': [],
63
+ 'simple_instruct': ['t5-small', 't5-large', 'google/flan-t5', 'google/flan-t5-xxl', 'google/flan-ul2'],
64
+ 'instruct_vicuna': ['AlekseyKorshuk/vicuna-7b', 'TheBloke/stable-vicuna-13B-HF', 'junelee/wizard-vicuna-13b'],
65
+ 'human_bot_orig': ['togethercomputer/GPT-NeoXT-Chat-Base-20B'],
66
+ "open_assistant": ['OpenAssistant/oasst-sft-7-llama-30b-xor', 'oasst-sft-7-llama-30b'],
67
+ "wizard_lm": ['ehartford/WizardLM-7B-Uncensored', 'ehartford/WizardLM-13B-Uncensored'],
68
+ "wizard_mega": ['openaccess-ai-collective/wizard-mega-13b'],
69
+ }
70
+
71
+ inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
72
+ inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}
73
+
74
+ prompt_types_strings = []
75
+ for p in PromptType:
76
+ prompt_types_strings.extend([p.name])
77
+
78
+ prompt_types = []
79
+ for p in PromptType:
80
+ prompt_types.extend([p.name, p.value, str(p.value)])
81
+
82
+
83
+ def get_prompt(prompt_type, chat, context, reduced):
84
+ if prompt_type in [-1, "-1", "plain"]:
85
+ promptA = promptB = PreInstruct = PreInput = PreResponse = ''
86
+ terminate_response = []
87
+ chat_sep = ''
88
+ humanstr = ''
89
+ botstr = ''
90
+ elif prompt_type == 'simple_instruct':
91
+ promptA = promptB = PreInstruct = PreInput = PreResponse = None
92
+ terminate_response = []
93
+ chat_sep = '\n'
94
+ humanstr = ''
95
+ botstr = ''
96
+ elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
97
+ promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (
98
+ chat and reduced) else ''
99
+ promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (
100
+ chat and reduced) else ''
101
+
102
+ PreInstruct = """
103
+ ### Instruction:
104
+ """
105
+
106
+ PreInput = """
107
+ ### Input:
108
+ """
109
+
110
+ PreResponse = """
111
+ ### Response:
112
+ """
113
+ if prompt_type in [7, "7", "instruct_with_end"]:
114
+ terminate_response = ['### End']
115
+ else:
116
+ terminate_response = None
117
+ chat_sep = '\n'
118
+ humanstr = PreInstruct
119
+ botstr = PreResponse
120
+ elif prompt_type in [1, "1", "quality"]:
121
+ promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (
122
+ chat and reduced) else ''
123
+ promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (
124
+ chat and reduced) else ''
125
+
126
+ PreInstruct = """
127
+ ### Instruction:
128
+ """
129
+
130
+ PreInput = """
131
+ ### Input:
132
+ """
133
+
134
+ PreResponse = """
135
+ ### Response:
136
+ """
137
+ terminate_response = None
138
+ chat_sep = '\n'
139
+ humanstr = PreInstruct # first thing human says
140
+ botstr = PreResponse # first thing bot says
141
+ elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
142
+ human = '<human>:'
143
+ bot = "<bot>:"
144
+ if reduced or context or prompt_type in [2, "2", "human_bot"]:
145
+ preprompt = ''
146
+ else:
147
+ cur_date = time.strftime('%Y-%m-%d')
148
+ cur_time = time.strftime('%H:%M:%S %p %Z')
149
+
150
+ PRE_PROMPT = """\
151
+ Current Date: {}
152
+ Current Time: {}
153
+
154
+ """
155
+ preprompt = PRE_PROMPT.format(cur_date, cur_time)
156
+ start = human
157
+ promptB = promptA = '%s%s ' % (preprompt, start)
158
+
159
+ PreInstruct = ""
160
+
161
+ PreInput = None
162
+
163
+ if reduced:
164
+ # when making context, want it to appear as-if LLM generated, which starts with space after :
165
+ PreResponse = bot + ' '
166
+ else:
167
+ # normally LLM adds space after this, because was how trained.
168
+ # if add space here, non-unique tokenization will often make LLM produce wrong output
169
+ PreResponse = bot
170
+
171
+ terminate_response = [start, PreResponse]
172
+ chat_sep = '\n'
173
+ humanstr = human # tag before human talks
174
+ botstr = bot # tag before bot talks
175
+ elif prompt_type in [3, "3", "dai_faq"]:
176
+ promptA = ''
177
+ promptB = 'Answer the following Driverless AI question.\n'
178
+
179
+ PreInstruct = """
180
+ ### Driverless AI frequently asked question:
181
+ """
182
+
183
+ PreInput = None
184
+
185
+ PreResponse = """
186
+ ### Driverless AI documentation answer:
187
+ """
188
+ terminate_response = ['\n\n']
189
+ chat_sep = terminate_response
190
+ humanstr = PreInstruct
191
+ botstr = PreResponse
192
+ elif prompt_type in [5, "5", "summarize"]:
193
+ promptA = promptB = PreInput = ''
194
+ PreInstruct = '## Main Text\n\n'
195
+ PreResponse = '\n\n## Summary\n\n'
196
+ terminate_response = None
197
+ chat_sep = '\n'
198
+ humanstr = PreInstruct
199
+ botstr = PreResponse
200
+ elif prompt_type in [6, "6", "instruct_vicuna"]:
201
+ promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
202
+ "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (
203
+ chat and reduced) else ''
204
+
205
+ PreInstruct = """
206
+ ### Human:
207
+ """
208
+
209
+ PreInput = None
210
+
211
+ PreResponse = """
212
+ ### Assistant:
213
+ """
214
+ terminate_response = [
215
+ '### Human:'] # but only allow terminate after prompt is found correctly, else can't terminate
216
+ chat_sep = '\n'
217
+ humanstr = PreInstruct
218
+ botstr = PreResponse
219
+ elif prompt_type in [10, "10", "prompt_answer"]:
220
+ preprompt = ''
221
+ prompt_tokens = "<|prompt|>"
222
+ answer_tokens = "<|answer|>"
223
+ start = prompt_tokens
224
+ promptB = promptA = '%s%s' % (preprompt, start)
225
+ PreInstruct = ""
226
+ PreInput = None
227
+ PreResponse = answer_tokens
228
+ eos = '<|endoftext|>' # neox eos
229
+ terminate_response = [start, PreResponse, eos]
230
+ chat_sep = eos
231
+ humanstr = prompt_tokens
232
+ botstr = answer_tokens
233
+ elif prompt_type in [11, "11", "open_assistant"]:
234
+ # From added_tokens.json
235
+ preprompt = ''
236
+ prompt_tokens = "<|prompter|>"
237
+ answer_tokens = "<|assistant|>"
238
+ start = prompt_tokens
239
+ promptB = promptA = '%s%s' % (preprompt, start)
240
+ PreInstruct = ""
241
+ PreInput = None
242
+ PreResponse = answer_tokens
243
+ pend = "<|prefix_end|>"
244
+ eos = "</s>"
245
+ terminate_response = [start, PreResponse, pend, eos]
246
+ chat_sep = eos
247
+ humanstr = prompt_tokens
248
+ botstr = answer_tokens
249
+ elif prompt_type in [12, "12", "wizard_lm"]:
250
+ # https://github.com/ehartford/WizardLM/blob/main/src/train_freeform.py
251
+ preprompt = ''
252
+ start = ''
253
+ promptB = promptA = '%s%s' % (preprompt, start)
254
+ PreInstruct = ""
255
+ PreInput = None
256
+ PreResponse = "\n\n### Response"
257
+ eos = "</s>"
258
+ terminate_response = [PreResponse, eos]
259
+ chat_sep = eos
260
+ humanstr = promptA
261
+ botstr = PreResponse
262
+ elif prompt_type in [13, "13", "wizard_mega"]:
263
+ preprompt = ''
264
+ start = ''
265
+ promptB = promptA = '%s%s' % (preprompt, start)
266
+ PreInstruct = """
267
+ ### Instruction:
268
+ """
269
+ PreInput = None
270
+ PreResponse = """
271
+ ### Assistant:
272
+ """
273
+ terminate_response = [PreResponse]
274
+ chat_sep = '\n'
275
+ humanstr = PreInstruct
276
+ botstr = PreResponse
277
+ else:
278
+ raise RuntimeError("No such prompt_type=%s" % prompt_type)
279
+
280
+ return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response, chat_sep, humanstr, botstr
281
+
282
+
283
+ def generate_prompt(data_point, prompt_type, chat, reduced):
284
+ context = data_point.get('context')
285
+ if context is None:
286
+ context = ''
287
+ instruction = data_point.get('instruction')
288
+ input = data_point.get('input')
289
+ output = data_point.get('output')
290
+ prompt_type = data_point.get('prompt_type', prompt_type)
291
+ assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
292
+ promptA, promptB, PreInstruct, PreInput, PreResponse, \
293
+ terminate_response, chat_sep, humanstr, botstr = get_prompt(prompt_type, chat, context, reduced)
294
+
295
+ prompt = context if not reduced else ''
296
+
297
+ if input and promptA:
298
+ prompt += f"""{promptA}"""
299
+ elif promptB:
300
+ prompt += f"""{promptB}"""
301
+
302
+ if instruction and PreInstruct is not None and input and PreInput is not None:
303
+ prompt += f"""{PreInstruct}{instruction}{PreInput}{input}"""
304
+ prompt = inject_newline(prompt_type, prompt)
305
+ elif instruction and input and PreInstruct is None and PreInput is not None:
306
+ prompt += f"""{PreInput}{instruction}
307
+ {input}"""
308
+ prompt = inject_newline(prompt_type, prompt)
309
+ elif input and instruction and PreInput is None and PreInstruct is not None:
310
+ prompt += f"""{PreInstruct}{instruction}
311
+ {input}"""
312
+ prompt = inject_newline(prompt_type, prompt)
313
+ elif instruction and PreInstruct is not None:
314
+ prompt += f"""{PreInstruct}{instruction}"""
315
+ prompt = inject_newline(prompt_type, prompt)
316
+ elif input and PreInput is not None:
317
+ prompt += f"""{PreInput}{input}"""
318
+ prompt = inject_newline(prompt_type, prompt)
319
+ elif input and instruction and PreInput is not None:
320
+ prompt += f"""{PreInput}{instruction}{input}"""
321
+ prompt = inject_newline(prompt_type, prompt)
322
+ elif input and instruction and PreInstruct is not None:
323
+ prompt += f"""{PreInstruct}{instruction}{input}"""
324
+ prompt = inject_newline(prompt_type, prompt)
325
+ elif input and instruction:
326
+ # i.e. for simple_instruct
327
+ prompt += f"""{instruction}: {input}"""
328
+ prompt = inject_newline(prompt_type, prompt)
329
+ elif input:
330
+ prompt += f"""{input}"""
331
+ prompt = inject_newline(prompt_type, prompt)
332
+ elif instruction:
333
+ prompt += f"""{instruction}"""
334
+ prompt = inject_newline(prompt_type, prompt)
335
+
336
+ if PreResponse is not None:
337
+ prompt += f"""{PreResponse}"""
338
+ pre_response = PreResponse # Don't use strip
339
+ else:
340
+ pre_response = ''
341
+
342
+ if output:
343
+ prompt += f"""{output}"""
344
+
345
+ return prompt, pre_response, terminate_response, chat_sep
346
+
347
+
348
+ def inject_newline(prompt_type, prompt):
349
+ if prompt_type not in [-1, '-1', 'plain', 'simple_instruct']:
350
+ # only add new line if structured prompt, while 'plain' is just generation of next tokens from input
351
+ prompt += '\n'
352
+ return prompt
353
 
354
 
355
  class Prompter(object):
 
364
  self.stream_output = stream_output
365
  self.repeat_penalty = repeat_penalty
366
  self.allowed_repeat_line_length = allowed_repeat_line_length
367
+ self.prompt = None
368
+ context = "" # not for chat context
369
+ reduced = False # not for chat context
370
+ self.promptA, self.promptB, self.PreInstruct, self.PreInput, self.PreResponse, \
371
+ self.terminate_response, self.chat_sep, self.humanstr, self.botstr = \
372
+ get_prompt(prompt_type, chat, context, reduced)
373
 
374
  def generate_prompt(self, data_point):
375
  reduced = False
 
412
  for oi, output in enumerate(outputs):
413
  if self.prompt_type in [0, '0', 'plain']:
414
  output = clean_response(output)
415
+ elif prompt is None:
416
+ # then use most basic parsing like pipeline
417
+ if self.botstr in output:
418
+ if self.humanstr:
419
+ output = clean_response(output.split(self.botstr)[1].strip().split(self.humanstr)[0].strip())
420
+ else:
421
+ # i.e. use after bot but only up to next bot
422
+ output = clean_response(output.split(self.botstr)[1].strip().split(self.botstr)[0].strip())
423
+ else:
424
+ # output = clean_response(output.strip())
425
+ # assume just not printed yet
426
+ output = ""
427
  else:
428
  # find first instance of pre_response
429
  # prompt sometimes has odd characters, that mutate length,
requirements.txt CHANGED
@@ -2,7 +2,7 @@
2
  datasets==2.12.0
3
  sentencepiece==0.1.97
4
  accelerate==0.18.0
5
- gradio==3.27.0
6
  huggingface_hub==0.14.1
7
  appdirs==1.4.4
8
  fire==0.5.0
@@ -35,7 +35,7 @@ tensorboard==2.12.1
35
  neptune==1.1.1
36
 
37
  # for gradio client
38
- gradio_client==0.1.3
39
  beautifulsoup4==4.12.2
40
  markdown==3.4.1
41
 
@@ -45,7 +45,55 @@ pytest-xdist==3.2.1
45
  nltk==3.8.1
46
  textstat==0.7.3
47
  pandoc==2.3
48
- pypandoc==1.11
 
49
  openpyxl==3.1.2
50
  lm_dataformat==0.0.20
51
  bioc==2.0
 
2
  datasets==2.12.0
3
  sentencepiece==0.1.97
4
  accelerate==0.18.0
5
+ gradio==3.31.0
6
  huggingface_hub==0.14.1
7
  appdirs==1.4.4
8
  fire==0.5.0
 
35
  neptune==1.1.1
36
 
37
  # for gradio client
38
+ gradio_client==0.2.5
39
  beautifulsoup4==4.12.2
40
  markdown==3.4.1
41
 
 
45
  nltk==3.8.1
46
  textstat==0.7.3
47
  pandoc==2.3
48
+ #pypandoc==1.11
49
+ pypandoc_binary==1.11
50
  openpyxl==3.1.2
51
  lm_dataformat==0.0.20
52
  bioc==2.0
53
+ # To install with constraints
54
+ # grep -v '#\|peft' requirements.txt > req_constraints.txt ; pip install -r requirements_optional_langchain.txt -c req_constraints.txt
55
+
56
+ # optional for chat with PDF
57
+ langchain==0.0.178
58
+ pypdf==3.8.1
59
+ tiktoken==0.3.3
60
+ # avoid textract, requires old six
61
+ #textract==1.6.5
62
+ # choose:
63
+ #faiss-cpu
64
+ faiss-gpu==1.7.2
65
+
66
+ # for HF embeddings
67
+ sentence_transformers==2.2.2
68
+ # for OpenAI embeddings (requires key)
69
+ openai==0.27.6
70
+
71
+ # local vector db
72
+ chromadb==0.3.23
73
+ # server vector db
74
+ #pymilvus==2.2.8
75
+
76
+ # weak url support, if can't install opencv etc. If comment-in this one, then comment-out unstructured[local-inference]==0.6.6
77
+ # unstructured==0.6.6
78
+
79
+ # strong support for images
80
+ # Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libreoffice
81
+ unstructured[local-inference]==0.6.6
82
+ #pdf2image==1.16.3
83
+ #pytesseract==0.3.10
84
+ pillow
85
+
86
+ pdfminer.six==20221105
87
+ urllib3==1.26.6
88
+ requests_file==1.5.1
89
+
90
+ #pdf2image==1.16.3
91
+ #pytesseract==0.3.10
92
+ tabulate==0.9.0
93
+ # FYI pandoc already part of requirements.txt
94
+
95
+ jq==1.4.1
96
+
97
+ # to check licenses
98
+ # Run: pip-licenses|grep -v 'BSD\|Apache\|MIT'
99
+ pip-licenses==4.3.0
utils.py CHANGED
@@ -1,4 +1,6 @@
 
1
  import functools
 
2
  import os
3
  import gc
4
  import pathlib
@@ -12,6 +14,9 @@ import traceback
12
  import zipfile
13
  from datetime import datetime
14
  import filelock
15
  import numpy as np
16
  import pandas as pd
17
 
@@ -53,7 +58,11 @@ def clear_torch_cache():
53
 
54
 
55
  def ping():
56
- print('Ping: %s' % str(datetime.now()), flush=True)
57
 
58
 
59
  def get_torch_allocated():
@@ -61,6 +70,16 @@ def get_torch_allocated():
61
  return torch.cuda.memory_allocated()
62
 
63
 
64
  def system_info():
65
  import psutil
66
 
@@ -111,21 +130,26 @@ def system_info_print():
111
  return "Error: %s" % str(e)
112
 
113
 
114
- def zip_data(root_dirs=None, zip_file=None, base_dir='./'):
115
  try:
116
  return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
117
  except Exception as e:
118
  traceback.print_exc()
119
  print('Exception in zipping: %s' % str(e))
 
 
120
 
121
 
122
  def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
 
 
123
  if zip_file is None:
124
  datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
125
  host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
126
  zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
127
  assert root_dirs is not None
128
-
 
129
  with zipfile.ZipFile(zip_file, "w") as expt_zip:
130
  for root_dir in root_dirs:
131
  if root_dir is None:
@@ -237,6 +261,7 @@ class NullContext(threading.local):
237
  Used as a stand-in if a particular block of code is only sometimes
238
  used with a normal context manager:
239
  """
 
240
  def __init__(self, *args, **kwargs):
241
  pass
242
 
@@ -270,16 +295,18 @@ class ThreadException(Exception):
270
  class EThread(threading.Thread):
271
  # Function that raises the custom exception
272
  def __init__(self, group=None, target=None, name=None,
273
- args=(), kwargs=None, *, daemon=None, bucket=None):
274
  self.bucket = bucket
275
- self.streamer = kwargs.get('streamer')
276
  self.exc = None
 
277
  super().__init__(group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)
278
 
279
  def run(self):
280
  # Variable that stores the exception, if raised by someFunction
281
  try:
282
- super().run()
 
283
  except BaseException as e:
284
  print("thread exception: %s" % str(sys.exc_info()))
285
  self.bucket.put(sys.exc_info())
@@ -287,6 +314,10 @@ class EThread(threading.Thread):
287
  if self.streamer:
288
  print("make stop: %s" % str(sys.exc_info()), flush=True)
289
  self.streamer.do_stop = True
290
 
291
  def join(self, timeout=None):
292
  threading.Thread.join(self)
@@ -295,3 +326,443 @@ class EThread(threading.Thread):
295
  # if any was caught
296
  if self.exc:
297
  raise self.exc
 
 
1
+ import contextlib
2
  import functools
3
+ import hashlib
4
  import os
5
  import gc
6
  import pathlib
 
14
  import zipfile
15
  from datetime import datetime
16
  import filelock
17
+ import requests, uuid
18
+ from typing import Tuple, Callable, Dict
19
+ from concurrent.futures import ProcessPoolExecutor
20
  import numpy as np
21
  import pandas as pd
22
 
 
58
 
59
 
60
  def ping():
61
+ try:
62
+ print('Ping: %s' % str(datetime.now()), flush=True)
63
+ except AttributeError:
64
+ # some programs wrap print and will fail with flush passed
65
+ pass
66
 
67
 
68
  def get_torch_allocated():
 
70
  return torch.cuda.memory_allocated()
71
 
72
 
73
+ def get_device():
74
+ import torch
75
+ if torch.cuda.is_available():
76
+ device = "cuda"
77
+ else:
78
+ device = "cpu"
79
+
80
+ return device
81
+
82
+
83
  def system_info():
84
  import psutil
85
 
 
130
  return "Error: %s" % str(e)
131
 
132
 
133
+ def zip_data(root_dirs=None, zip_file=None, base_dir='./', fail_any_exception=False):
134
  try:
135
  return _zip_data(zip_file=zip_file, base_dir=base_dir, root_dirs=root_dirs)
136
  except Exception as e:
137
  traceback.print_exc()
138
  print('Exception in zipping: %s' % str(e))
139
+ if fail_any_exception:
140
+ raise
141
 
142
 
143
  def _zip_data(root_dirs=None, zip_file=None, base_dir='./'):
144
+ if isinstance(root_dirs, str):
145
+ root_dirs = [root_dirs]
146
  if zip_file is None:
147
  datetime_str = str(datetime.now()).replace(" ", "_").replace(":", "_")
148
  host_name = os.getenv('HF_HOSTNAME', 'emptyhost')
149
  zip_file = "data_%s_%s.zip" % (datetime_str, host_name)
150
  assert root_dirs is not None
151
+ if not os.path.isdir(os.path.dirname(zip_file)):
152
+ os.makedirs(os.path.dirname(zip_file), exist_ok=True)
153
  with zipfile.ZipFile(zip_file, "w") as expt_zip:
154
  for root_dir in root_dirs:
155
  if root_dir is None:
 
261
  Used as a stand-in if a particular block of code is only sometimes
262
  used with a normal context manager:
263
  """
264
+
265
  def __init__(self, *args, **kwargs):
266
  pass
267
 
 
295
  class EThread(threading.Thread):
296
  # Function that raises the custom exception
297
  def __init__(self, group=None, target=None, name=None,
298
+ args=(), kwargs=None, *, daemon=None, streamer=None, bucket=None):
299
  self.bucket = bucket
300
+ self.streamer = streamer
301
  self.exc = None
302
+ self._return = None
303
  super().__init__(group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)
304
 
305
  def run(self):
306
  # Variable that stores the exception, if raised by someFunction
307
  try:
308
+ if self._target is not None:
309
+ self._return = self._target(*self._args, **self._kwargs)
310
  except BaseException as e:
311
  print("thread exception: %s" % str(sys.exc_info()))
312
  self.bucket.put(sys.exc_info())
 
314
  if self.streamer:
315
  print("make stop: %s" % str(sys.exc_info()), flush=True)
316
  self.streamer.do_stop = True
317
+ finally:
318
+ # Avoid a refcycle if the thread is running a function with
319
+ # an argument that has a member that points to the thread.
320
+ del self._target, self._args, self._kwargs
321
 
322
  def join(self, timeout=None):
323
  threading.Thread.join(self)
 
326
  # if any was caught
327
  if self.exc:
328
  raise self.exc
329
+ return self._return
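A hedged usage sketch for EThread: the bucket queue collects exception info from the worker, and join() now also hands back the target's return value:

import queue

def work(a, b):
    return a + b

bucket = queue.Queue()
t = EThread(target=work, args=(1, 2), bucket=bucket)
t.start()
result = t.join()  # re-raises any exception caught in the thread, else returns the result
print(result)      # 3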
330
+
331
+
332
+ def import_matplotlib():
333
+ import matplotlib
334
+ matplotlib.use('agg')
335
+ # KEEP THESE HERE! START
336
+ import matplotlib.pyplot as plt
337
+ import pandas as pd
338
+ # to avoid dlopen deadlock in fork
339
+ import pandas.core.computation.expressions as pd_expressions
340
+ import pandas._libs.groupby as pd_libgroupby
341
+ import pandas._libs.reduction as pd_libreduction
342
+ import pandas.core.algorithms as pd_algorithms
343
+ import pandas.core.common as pd_com
344
+ import numpy as np
345
+ # KEEP THESE HERE! END
346
+
347
+
348
+ def get_sha(value):
349
+ return hashlib.md5(str(value).encode('utf-8')).hexdigest()
350
+
351
+
352
+ def sanitize_filename(name):
353
+ """
354
+ Sanitize file *base* names.
355
+ :param name: name to sanitize
356
+ :return:
357
+ """
358
+ bad_chars = ['[', ']', ',', '/', '\\', '\\w', '\\s', '-', '+', '\"', '\'', '>', '<', ' ', '=', ')', '(', ':', '^']
359
+ for char in bad_chars:
360
+ name = name.replace(char, "_")
361
+
362
+ length = len(name)
363
+ file_length_limit = 250 # bit smaller than 256 for safety
364
+ sha_length = 32
365
+ real_length_limit = file_length_limit - (sha_length + 2)
366
+ if length > file_length_limit:
367
+ sha = get_sha(name)
368
+ half_real_length_limit = max(1, int(real_length_limit / 2))
369
+ name = name[0:half_real_length_limit] + "_" + sha + "_" + name[length - half_real_length_limit:length]
370
+
371
+ return name
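Illustrative behavior of sanitize_filename, following the logic above:

print(sanitize_filename("my file (v2).txt"))  # spaces and parens become underscores: my_file__v2_.txt
short = sanitize_filename("x" * 300)
print(len(short))                             # 250: over-long names collapse around an md5 infix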
372
+
373
+
374
+ def shutil_rmtree_simple(*args, **kwargs):
375
+ path = args[0]
376
+ assert not os.path.samefile(path, "./tmp"), "Should not be trying to remove entire data directory: %s" % str(path)
377
+ # print("Removing path %s" % args[0]) # for debugging
378
+ return shutil.rmtree(*args, **kwargs)
379
+
380
+
381
+ def remove_simple(path: str):
382
+ try:
383
+ if path is not None and os.path.exists(path):
384
+ if os.path.isdir(path):
385
+ shutil_rmtree_simple(path, ignore_errors=True)
386
+ else:
387
+ with contextlib.suppress(FileNotFoundError):
388
+ os.remove(path)
389
+ except:
390
+ pass
391
+
392
+
393
+ def makedirs(path, exist_ok=True):
394
+ """
395
+ Avoid some inefficiency in os.makedirs()
396
+ :param path:
397
+ :param exist_ok:
398
+ :return:
399
+ """
400
+ if os.path.isdir(path) and os.path.exists(path):
401
+ assert exist_ok, "Path already exists"
402
+ return path
403
+ os.makedirs(path, exist_ok=exist_ok)
404
+
405
+
406
+ def atomic_move_simple(src, dst):
407
+ try:
408
+ shutil.move(src, dst)
409
+ except (shutil.Error, FileExistsError):
410
+ pass
411
+ remove_simple(src)
412
+
413
+
414
+ def download_simple(url, dest=None, print_func=None):
415
+ if print_func is not None:
416
+ print_func("BEGIN get url %s" % str(url))
417
+ if url.startswith("file://"):
418
+ from requests_file import FileAdapter
419
+ s = requests.Session()
420
+ s.mount('file://', FileAdapter())
421
+ url_data = s.get(url, stream=True)
422
+ else:
423
+ url_data = requests.get(url, stream=True)
424
+ if dest is None:
425
+ dest = os.path.basename(url)
426
+ if url_data.status_code != requests.codes.ok:
427
+ msg = "Cannot get url %s, code: %s, reason: %s" % (
428
+ str(url),
429
+ str(url_data.status_code),
430
+ str(url_data.reason),
431
+ )
432
+ raise requests.exceptions.RequestException(msg)
433
+ url_data.raw.decode_content = True
434
+ makedirs(os.path.dirname(dest), exist_ok=True)
435
+ uuid_tmp = str(uuid.uuid4())[:6]
436
+ dest_tmp = dest + "_dl_" + uuid_tmp + ".tmp"
437
+ with open(dest_tmp, "wb") as f:
438
+ shutil.copyfileobj(url_data.raw, f)
439
+ atomic_move_simple(dest_tmp, dest)
440
+ if print_func is not None:
441
+ print_func("END get url %s" % str(url))
442
+
443
+
444
+ def download(url, dest=None, dest_path=None):
+     if dest_path is not None:
+         dest = os.path.join(dest_path, os.path.basename(url))
+         if os.path.isfile(dest):
+             print("already downloaded %s -> %s" % (url, dest))
+             return dest
+     elif dest is not None:
+         if os.path.exists(dest):
+             print("already downloaded %s -> %s" % (url, dest))
+             return dest
+     else:
+         uuid_tmp = "dl2_" + str(uuid.uuid4())[:6]
+         dest = uuid_tmp + os.path.basename(url)
+
+     print("downloading %s to %s" % (url, dest))
+
+     if url.startswith("file://"):
+         from requests_file import FileAdapter
+         s = requests.Session()
+         s.mount('file://', FileAdapter())
+         url_data = s.get(url, stream=True)
+     else:
+         url_data = requests.get(url, stream=True)
+
+     if url_data.status_code != requests.codes.ok:
+         msg = "Cannot get url %s, code: %s, reason: %s" % (
+             str(url), str(url_data.status_code), str(url_data.reason))
+         raise requests.exceptions.RequestException(msg)
+     url_data.raw.decode_content = True
+     dirname = os.path.dirname(dest)
+     if dirname != "" and not os.path.isdir(dirname):
+         makedirs(os.path.dirname(dest), exist_ok=True)
+     uuid_tmp = "dl3_" + str(uuid.uuid4())[:6]
+     dest_tmp = dest + "_" + uuid_tmp + ".tmp"
+     with open(dest_tmp, 'wb') as f:
+         shutil.copyfileobj(url_data.raw, f)
+     try:
+         shutil.move(dest_tmp, dest)
+     except FileExistsError:
+         pass
+     remove_simple(dest_tmp)
+     return dest
+
+
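download() returns the destination path, skips the fetch when the file already exists, and invents a uuid-prefixed name when no destination is given; download_simple() is the quieter variant with an optional progress callback. A hedged usage sketch (URL and directory are placeholders, not from this commit):

    # saved as ./models/<basename of url>, or returned immediately if already present
    path = download("https://example.com/model.bin", dest_path="./models")

    # explicit destination plus BEGIN/END progress messages via print
    download_simple("https://example.com/data.csv", dest="./data/data.csv", print_func=print)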
+ def get_url(x, from_str=False, short_name=False):
+     if not from_str:
+         source = x.metadata['source']
+     else:
+         source = x
+     if short_name:
+         source_name = get_short_name(source)
+     else:
+         source_name = source
+     if source.startswith('http://') or source.startswith('https://'):
+         return """<a href="%s" target="_blank" rel="noopener noreferrer">%s</a>""" % (
+             source, source_name)
+     else:
+         return """<a href="file/%s" target="_blank" rel="noopener noreferrer">%s</a>""" % (
+             source, source_name)
+
+
+ def get_short_name(name, maxl=50):
+     if name is None:
+         return ''
+     length = len(name)
+     if length > maxl:
+         allow_length = maxl - 3
+         half_allowed = max(1, int(allow_length / 2))
+         name = name[0:half_allowed] + "..." + name[length - half_allowed:length]
+     return name
+
+
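get_url() renders a source, either an object carrying metadata['source'] or, with from_str=True, a plain string, as an HTML anchor, while get_short_name() middle-truncates long names to at most maxl characters. Illustrative calls (values invented):

    get_short_name("a" * 60, maxl=10)
    # -> 'aaa...aaa'  (3 leading chars + "..." + 3 trailing chars)

    get_url("docs/report.pdf", from_str=True)
    # -> '<a href="file/docs/report.pdf" target="_blank" rel="noopener noreferrer">docs/report.pdf</a>'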
+ def cuda_vis_check(total_gpus):
+     """Helper function to count GPUs by environment variable
+     Stolen from Jon's h2o4gpu utils
+     """
+     cudavis = os.getenv("CUDA_VISIBLE_DEVICES")
+     which_gpus = []
+     if cudavis is not None:
+         # prune away white-space, non-numerics,
+         # except commas for simple checking
+         cudavis = "".join(cudavis.split())
+         import re
+         cudavis = re.sub("[^0-9,]", "", cudavis)
+
+         lencudavis = len(cudavis)
+         if lencudavis == 0:
+             total_gpus = 0
+         else:
+             # use the sanitized value, not the raw env var, so stray characters cannot break int()
+             total_gpus = min(total_gpus, cudavis.count(",") + 1)
+             which_gpus = cudavis.split(",")
+             which_gpus = [int(x) for x in which_gpus]
+     else:
+         which_gpus = list(range(0, total_gpus))
+
+     return total_gpus, which_gpus
+
+
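cuda_vis_check() caps the detected GPU count by whatever CUDA_VISIBLE_DEVICES exposes and reports which device indices are usable. A quick sketch of its behavior (the environment values here are purely illustrative):

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
    print(cuda_vis_check(4))   # -> (2, [0, 2]): 4 physical GPUs, but only two visible

    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    print(cuda_vis_check(4))   # -> (0, []): an empty setting hides all GPUs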
+ def get_ngpus_vis(raise_if_exception=True):
+     ngpus_vis1 = 0
+
+     shell = False
+     if shell:
+         cmd = "nvidia-smi -L 2> /dev/null"
+     else:
+         cmd = ["nvidia-smi", "-L"]
+
+     try:
+         timeout = 5 * 3
+         o = subprocess.check_output(cmd, shell=shell, timeout=timeout)
+         lines = o.decode("utf-8").splitlines()
+         ngpus_vis1 = 0
+         for line in lines:
+             if 'Failed to initialize NVML' not in line:
+                 ngpus_vis1 += 1
+     except (FileNotFoundError, subprocess.CalledProcessError, OSError):
+         # systems without GPUs may not have nvidia-smi at all, so don't treat this as fatal
+         pass
+     except subprocess.TimeoutExpired as e:
+         print('Failed get_ngpus_vis: %s' % str(e))
+         if raise_if_exception:
+             raise
+
+     ngpus_vis1, which_gpus = cuda_vis_check(ngpus_vis1)
+     return ngpus_vis1
+
+
+ def get_mem_gpus(raise_if_exception=True, ngpus=None):
+     totalmem_gpus1 = 0
+     usedmem_gpus1 = 0
+     freemem_gpus1 = 0
+
+     if ngpus == 0:
+         return totalmem_gpus1, usedmem_gpus1, freemem_gpus1
+
+     try:
+         cmd = "nvidia-smi -q 2> /dev/null | grep -A 3 'FB Memory Usage'"
+         o = subprocess.check_output(cmd, shell=True, timeout=15)
+         lines = o.decode("utf-8").splitlines()
+         for line in lines:
+             # nvidia-smi reports MiB, so convert to bytes
+             if 'Total' in line:
+                 totalmem_gpus1 += int(line.split()[2]) * 1024 ** 2
+             if 'Used' in line:
+                 usedmem_gpus1 += int(line.split()[2]) * 1024 ** 2
+             if 'Free' in line:
+                 freemem_gpus1 += int(line.split()[2]) * 1024 ** 2
+     except (FileNotFoundError, subprocess.CalledProcessError, OSError):
+         # systems without GPUs may not have nvidia-smi at all, so don't treat this as fatal
+         pass
+     except subprocess.TimeoutExpired as e:
+         print('Failed get_mem_gpus: %s' % str(e))
+         if raise_if_exception:
+             raise
+
+     return totalmem_gpus1, usedmem_gpus1, freemem_gpus1
+
+
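Together these helpers give a rough picture of visible GPUs and their memory by shelling out to nvidia-smi. A small usage sketch (results are of course machine dependent):

    n = get_ngpus_vis(raise_if_exception=False)
    total, used, free = get_mem_gpus(raise_if_exception=False, ngpus=n)
    print("visible GPUs: %d, free GPU memory: %.1f GiB" % (n, free / 1024 ** 3))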
+ class ForkContext(threading.local):
+     """
+     Set context for forking
+     Ensures state is returned once done
+     """
+
+     def __init__(self, args=None, kwargs=None, forkdata_capable=True):
+         """
+         :param args:
+         :param kwargs:
+         :param forkdata_capable: whether fork is forkdata capable and will use copy-on-write forking of args/kwargs
+         """
+         self.forkdata_capable = forkdata_capable
+         if self.forkdata_capable:
+             self.has_args = args is not None
+             self.has_kwargs = kwargs is not None
+             forkdatacontext.args = args
+             forkdatacontext.kwargs = kwargs
+         else:
+             self.has_args = False
+             self.has_kwargs = False
+
+     def __enter__(self):
+         try:
+             # flush all outputs so buffered output isn't duplicated during fork -- don't print/log inside ForkContext contexts!
+             sys.stdout.flush()
+             sys.stderr.flush()
+         except BaseException as e:
+             # __exit__ is not called if __enter__ raises, so don't leave forkdatacontext filled in that case
+             print("ForkContext failure on enter: %s" % str(e))
+             self.finally_act()
+             raise
+         return self
+
+     def __exit__(self, exc_type, exc_value, exc_traceback):
+         self.finally_act()
+
+     def finally_act(self):
+         """
+         Run when an exception is hit or the end of the context is reached;
+         forkdatacontext is reset first, which is crucial so state is cleared even if later calls fail
+         :return: None
+         """
+         if self.forkdata_capable and (self.has_args or self.has_kwargs):
+             forkdatacontext._reset()
+
+
+ class _ForkDataContext(threading.local):
+     def __init__(
+             self,
+             args=None,
+             kwargs=None,
+     ):
+         """
+         Global context for fork to carry data to subprocess instead of relying upon copy/pickle/serialization
+
+         :param args: args
+         :param kwargs: kwargs
+         """
+         assert isinstance(args, (tuple, type(None)))
+         assert isinstance(kwargs, (dict, type(None)))
+         self.__args = args
+         self.__kwargs = kwargs
+
+     @property
+     def args(self) -> Tuple:
+         """returns args"""
+         return self.__args
+
+     @args.setter
+     def args(self, args):
+         if self.__args is not None:
+             raise AttributeError(
+                 "args cannot be overwritten: %s %s" % (str(self.__args), str(self.__kwargs))
+             )
+
+         self.__args = args
+
+     @property
+     def kwargs(self) -> Dict:
+         """returns kwargs"""
+         return self.__kwargs
+
+     @kwargs.setter
+     def kwargs(self, kwargs):
+         if self.__kwargs is not None:
+             raise AttributeError(
+                 "kwargs cannot be overwritten: %s %s" % (str(self.__args), str(self.__kwargs))
+             )
+
+         self.__kwargs = kwargs
+
+     def _reset(self):
+         """Reset fork arg-kwarg context to default values"""
+         self.__args = None
+         self.__kwargs = None
+
+     def get_args_kwargs(self, func, args, kwargs) -> Tuple[Callable, Tuple, Dict]:
+         if self.__args:
+             args = self.__args[1:]
+             if not func:
+                 assert len(self.__args) > 0, "if no func is passed, it must be the first element of args"
+                 func = self.__args[0]  # should always be there
+         if self.__kwargs:
+             kwargs = self.__kwargs
+         try:
+             return func, args, kwargs
+         finally:
+             forkdatacontext._reset()
+
+     @staticmethod
+     def get_args_kwargs_for_traced_func(func, args, kwargs):
+         """
+         Return args/kwargs out of forkdatacontext when using copy-on-write way of passing args/kwargs
+         :param func: actual function run by _traced_func, which itself is directly what mppool treats as function
+         :param args:
+         :param kwargs:
+         :return: func, args, kwargs from forkdatacontext if used, else originals
+         """
+         # first 3 lines are debug
+         func_was_None = func is None
+         args_was_None_or_empty = args is None or len(args) == 0
+         kwargs_was_None_or_empty = kwargs is None or len(kwargs) == 0
+
+         forkdatacontext_args_was_None = forkdatacontext.args is None
+         forkdatacontext_kwargs_was_None = forkdatacontext.kwargs is None
+         func, args, kwargs = forkdatacontext.get_args_kwargs(func, args, kwargs)
+         using_forkdatacontext = func_was_None and func is not None  # pulled func out of forkdatacontext.__args[0]
+         assert forkdatacontext.args is None, "forkdatacontext.args should be None after get_args_kwargs"
+         assert forkdatacontext.kwargs is None, "forkdatacontext.kwargs should be None after get_args_kwargs"
+
+         proc_type = kwargs.get('proc_type', 'SUBPROCESS')
+         if using_forkdatacontext:
+             assert proc_type == "SUBPROCESS"
+         if proc_type == "NORMAL":
+             assert forkdatacontext_args_was_None, "if no fork, expect forkdatacontext.args None entering _traced_func"
+             assert forkdatacontext_kwargs_was_None, "if no fork, expect forkdatacontext.kwargs None entering _traced_func"
+         assert func is not None, "function should not be None, indicates original args[0] was None or args was None"
+
+         return func, args, kwargs
+
+
+ forkdatacontext = _ForkDataContext()
+
+
+ def _traced_func(func, *args, **kwargs):
+     func, args, kwargs = forkdatacontext.get_args_kwargs_for_traced_func(func, args, kwargs)
+     return func(*args, **kwargs)
+
+
+ def call_subprocess_onetask(func, args=None, kwargs=None):
+     if isinstance(args, list):
+         args = tuple(args)
+     if args is None:
+         args = ()
+     if kwargs is None:
+         kwargs = {}
+     args = list(args)
+     args = [func] + args
+     args = tuple(args)
+     with ForkContext(args=args, kwargs=kwargs):
+         args = (None,)
+         kwargs = {}
+         with ProcessPoolExecutor(max_workers=1) as executor:
+             future = executor.submit(_traced_func, *args, **kwargs)
+             return future.result()
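The point of the ForkContext/_ForkDataContext pair is to hand func and its arguments to the forked worker through module state (inherited on fork) rather than pickling them through executor.submit(); _traced_func then pulls them back out inside the child and resets the shared context. A minimal hedged example of the public entry point (the worker function and its arguments are invented for illustration, and this relies on the default fork start method on Linux):

    def _heavy_work(path, n_rows=1000):
        # runs once in a dedicated subprocess, so any crash stays isolated there
        return "processed %s with %d rows" % (path, n_rows)

    result = call_subprocess_onetask(_heavy_work, args=["./data.csv"], kwargs=dict(n_rows=10))
    print(result)  # -> processed ./data.csv with 10 rows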