
# miracl-v1.0-mdpr-tied-pft-msmarco-ft-miracl-${lang}

This index was generated on 2023/03/21 using Tevatron with the following commands:

## Create Train Directory

`create_train_dir.py`:

```python
import json
from random import shuffle

from datasets import load_dataset
from pyserini.search.lucene import LuceneSearcher
from tqdm import tqdm

# BM25 searcher over the prebuilt MIRACL index for the target language.
searcher = LuceneSearcher.from_prebuilt_index('miracl-v1.0-${lang}')
searcher.set_language('${lang}')

miracl_train = load_dataset('miracl/miracl', '${lang}', split='train')
with open('miracl_train_bm25_neg_top100_random30.${lang}.jsonl', 'w') as f:
    for data in tqdm(miracl_train):
        query = data['query']
        positives = data['positive_passages']
        negatives = data['negative_passages']
        positive_ids = [p['docid'] for p in positives]
        negative_ids = [p['docid'] for p in negatives]
        # Mine additional hard negatives from the top-100 BM25 results,
        # skipping passages already labeled positive or negative.
        hits = searcher.search(query, k=100)
        bm25_negatives = []
        for hit in hits:
            info = json.loads(hit.raw)
            if info['docid'] not in positive_ids and info['docid'] not in negative_ids:
                bm25_negatives.append(info)
        # Pool the labeled and mined negatives, then keep a random 30.
        all_negatives = negatives + bm25_negatives
        shuffle(all_negatives)
        random_30_negatives = all_negatives[:30]
        data['negative_passages'] = random_30_negatives
        if len(random_30_negatives) > 0:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
```

```bash
python create_train_dir.py
```
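
Each line of the resulting JSONL file is one training example with the MIRACL fields used above (`query`, `positive_passages`, `negative_passages`). A quick sanity check, as a minimal sketch (Swahili shown; swap in your `${lang}`):

```python
import json

# Peek at the first training example and its negative count.
with open('miracl_train_bm25_neg_top100_random30.sw.jsonl') as f:
    example = json.loads(next(f))

print(example['query'])
print(len(example['positive_passages']), 'positives')
print(len(example['negative_passages']), 'negatives (at most 30 by construction)')
```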

## Train

```bash
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.train \
  --output_dir model_miracl_${lang} \
  --model_name_or_path castorini/mdpr-tied-pft-msmarco \
  --tokenizer_name bert-base-multilingual-cased \
  --save_steps 20000 \
  --dataset_name Tevatron/msmarco-passage \
  --per_device_train_batch_size 64 \
  --train_dir miracl_train_bm25_neg_top100_random30.${lang}.jsonl \
  --train_n_passages 2 \
  --learning_rate 1e-5 \
  --q_max_len 32 \
  --p_max_len 256 \
  --num_train_epochs 40 \
  --logging_steps 10 \
  --overwrite_output_dir \
  --fp16
```
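
The checkpoint written to `model_miracl_${lang}` should load as a standard Hugging Face BERT encoder. A minimal sketch for sanity-checking it outside Tevatron, assuming CLS pooling as in mDPR (the query text is arbitrary):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Load the fine-tuned checkpoint from the training step above (lang=sw here).
model = AutoModel.from_pretrained('model_miracl_sw')
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

inputs = tokenizer('sample query', return_tensors='pt', truncation=True, max_length=32)
with torch.no_grad():
    # mDPR-style models take the [CLS] token representation as the embedding.
    embedding = model(**inputs).last_hidden_state[:, 0, :]
print(embedding.shape)  # expected: torch.Size([1, 768])
```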

## Encode Corpus

```bash
CUDA_VISIBLE_DEVICES=0 python -m tevatron.driver.encode \
  --output_dir=temp_out \
  --model_name_or_path model_miracl_${lang} \
  --fp16 \
  --per_device_eval_batch_size 256 \
  --dataset_name miracl/miracl-corpus:${lang} \
  --p_max_len 256 \
  --encoded_save_path model_miracl_${lang}_corpus/${lang}_corpus_emb.pt
```
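
Despite the `.pt` extension, the saved shard is a pickled `(embeddings, docids)` pair, which is what the conversion script below expects. A minimal sketch for inspecting one shard (Swahili shown):

```python
import pickle

import numpy as np

# Load one encoded shard produced by tevatron.driver.encode (lang=sw here).
with open('model_miracl_sw_corpus/sw_corpus_emb.pt', 'rb') as f:
    reps, lookup = pickle.load(f)

reps = np.array(reps)
print(reps.shape)   # expected: (num_passages, 768)
print(lookup[:3])   # the first few docids, aligned with the rows of reps
```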

## Convert Index

`convert_index.py`:

```python
import argparse
import os
import pickle

import faiss
import numpy as np
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--output', type=str, required=True)
args = parser.parse_args()

def pickle_load(path):
    # Each encoded shard is a pickled (embeddings, docid lookup) pair.
    with open(path, 'rb') as f:
        reps, lookup = pickle.load(f)
    return np.array(reps), lookup

# Flat inner-product index; the mDPR embeddings are 768-dimensional.
index = faiss.IndexFlatIP(768)

all_ids = []
for name in tqdm(os.listdir(args.input)):
    if 'corpus_emb' not in name:
        continue
    path = os.path.join(args.input, name)
    reps, ids = pickle_load(path)
    all_ids.extend(ids)
    index.add(reps)

# Write the FAISS index plus a sidecar file mapping row position to docid.
faiss.write_index(index, f'{args.output}/index')
with open(f'{args.output}/docid', 'w') as f:
    for i in all_ids:
        f.write(f'{i}\n')
```

```bash
python convert_index.py --input=model_miracl_${lang}_corpus --output=${lang}_index
```
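
The converted index can be searched directly with FAISS; row positions map to docids through the sidecar file. A minimal sketch (the query vector is random, purely to exercise the index; a real query would be encoded with the fine-tuned model):

```python
import faiss
import numpy as np

# Load the converted index and its docid mapping (output of the step above).
index = faiss.read_index('sw_index/index')
with open('sw_index/docid') as f:
    docids = [line.strip() for line in f]

# Placeholder query embedding; IndexFlatIP scores are inner products.
query = np.random.rand(1, 768).astype('float32')
scores, positions = index.search(query, 10)
for score, pos in zip(scores[0], positions[0]):
    print(docids[pos], score)
```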

## Index from Pyserini

We verified that indexing directly through Pyserini with the same checkpoint, using the command below, yields the same score (run on basilisk; tested only on Swahili).

```bash
# $lang, $corpus, $shard_id, and $shard_num are assumed to be set in the environment.
encoder=castorini/mdpr-tied-pft-msmarco-ft-miracl-$lang

index_dir=miracl-v1.0-$lang-mdpr-tied-pft-msmarco-ft-miracl-$lang
echo $index_dir

CUDA_VISIBLE_DEVICES=1 \
python -m pyserini.encode  input   --corpus $corpus \
                                   --fields title text \
                                   --delimiter "\n\n" \
                                   --shard-id $shard_id \
                                   --shard-num $shard_num \
                           output  --embeddings $index_dir \
                                   --to-faiss \
                           encoder --encoder $encoder \
                                   --fields title text \
                                   --batch 128 \
                                   --encoder-class 'auto' \
                                   --fp16
```
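
The resulting index can then be searched with Pyserini's dense retrieval API. A minimal sketch, assuming the index directory and encoder names from above with `lang=sw` (the query string is arbitrary):

```python
from pyserini.search.faiss import AutoQueryEncoder, FaissSearcher

# Query encoder matching the checkpoint that built the index (CLS pooling).
encoder = AutoQueryEncoder('castorini/mdpr-tied-pft-msmarco-ft-miracl-sw')
searcher = FaissSearcher('miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw', encoder)

hits = searcher.search('sample query', k=10)
for hit in hits:
    print(hit.docid, hit.score)
```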