Spaces: Runtime error
geonmin-kim committed · Commit d6585f5
Parent(s): d571cf0
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff.
- .dockerignore +6 -0
- .gitattributes +2 -0
- .gitignore +23 -0
- .gitmodules +3 -0
- Dockerfile +53 -0
- LICENSE.txt +191 -0
- MANIFEST.in +5 -0
- README.md +257 -8
- __pycache__/search_online.cpython-38.pyc +0 -0
- analyze_answer_inclusion_in_retrieval.py +100 -0
- collections/.gitkeep +1 -0
- convert_cast22_eval_json_to_queries_tsv.py +24 -0
- download_oracle_jdk.sh +15 -0
- encode_8shards_cuda0.sh +26 -0
- encode_8shards_cuda1.sh +26 -0
- encode_8shards_cuda2.sh +26 -0
- encode_8shards_cuda3.sh +26 -0
- entrypoint.sh +23 -0
- flagged/log.csv +272 -0
- integrations/clprf/test_clprf.py +951 -0
- integrations/clprf/test_trec_covid_r3.py +85 -0
- integrations/clprf/test_trec_covid_r4.py +91 -0
- integrations/clprf/test_trec_covid_r5.py +141 -0
- integrations/dense/test_ance.py +185 -0
- integrations/dense/test_distilbert_kd.py +45 -0
- integrations/dense/test_distilbert_tasb.py +44 -0
- integrations/dense/test_dpr.py +339 -0
- integrations/dense/test_encode.py +137 -0
- integrations/dense/test_kilt.py +69 -0
- integrations/dense/test_sbert.py +67 -0
- integrations/dense/test_tct_colbert-v2.py +132 -0
- integrations/dense/test_tct_colbert.py +183 -0
- integrations/lucenesearcher_anserini_checker.py +78 -0
- integrations/lucenesearcher_score_checker.py +71 -0
- integrations/papers/test_ecir2023.py +91 -0
- integrations/papers/test_sigir2021.py +180 -0
- integrations/papers/test_sigir2022.py +100 -0
- integrations/run_lucenesearcher.py +51 -0
- integrations/sparse/test_lucenesearcher_check_core17.py +59 -0
- integrations/sparse/test_lucenesearcher_check_core18.py +59 -0
- integrations/sparse/test_lucenesearcher_check_irst.py +355 -0
- integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py +60 -0
- integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py +59 -0
- integrations/sparse/test_lucenesearcher_check_robust04.py +59 -0
- integrations/sparse/test_lucenesearcher_check_robust05.py +59 -0
- integrations/sparse/test_lucenesearcher_multithread.py +79 -0
- integrations/sparse/test_nmslib.py +66 -0
- integrations/sparse/test_prebuilt_msmarco_v1_doc.py +170 -0
- integrations/sparse/test_prebuilt_msmarco_v1_passage.py +96 -0
- integrations/sparse/test_prebuilt_robust04.py +42 -0
.dockerignore
ADDED
@@ -0,0 +1,6 @@
datasets/
embeddings/
sparse/
Dockerfile
indexes
.dockerignore
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pyserini/resources/jars/anserini-0.21.1-SNAPSHOT-fatjar.jar filter=lfs diff=lfs merge=lfs -text
+runs/q=NQtest_c=wikidpr_m=bm25.run filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,23 @@
*~
.DS_Store
.idea
__pycache__
*.pyc
pyserini/resources/jars/*.jar
collections/*
indexes/*
.vscode/
venv/
datasets/
embeddings/

# build directories from `python3 setup.py sdist bdist_wheel`
build/
dist/
pyserini.egg-info/

# runs should be ignored
runs/

# logs should also be ignored
logs/
.gitmodules
ADDED
@@ -0,0 +1,3 @@
[submodule "tools"]
    path = tools
    url = https://github.com/castorini/anserini-tools.git
Dockerfile
ADDED
@@ -0,0 +1,53 @@
FROM nvcr.io/nvidia/pytorch:22.01-py3

ENV HOME=/root
ENV LC_ALL C.UTF-8
ENV LANG C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive

# APP_PATH: assumed to be volume mounted with host
ENV APP_PATH=$HOME/np_app_text_retrieval_inference
ENV PYTHONPATH $APP_PATH

# EXTERNAL_PATH: assumed having system specific files, not volume mounted with host
ENV EXTERNAL_PATH=/workspace/external
RUN mkdir -p $EXTERNAL_PATH

WORKDIR $APP_PATH
COPY . $APP_PATH/

### ubuntu package
RUN apt-get update
RUN apt-get install git vim unzip wget ffmpeg libsm6 libxext6 -y

### conda
# note that retriever (based on pyserini) support cpu only for faiss
RUN conda install -c conda-forge faiss-cpu

### pip
# install editable mode
RUN pip install --upgrade pip && \
    pip install --no-binary :all: nmslib && \
    pip install -e .

# update java version >= 11
WORKDIR $EXTERNAL_PATH
RUN cp $APP_PATH/download_oracle_jdk.sh $EXTERNAL_PATH/
RUN sh download_oracle_jdk.sh
RUN ln -sf ${EXTERNAL_PATH}/jdk-20.0.2/bin/java /opt/conda/bin/java
ENV JAVA_HOME=${EXTERNAL_PATH}/jdk-20.0.2

# anserini: need for sparse retrieval/indexing
RUN conda install -c conda-forge maven
RUN mvn -v
RUN git clone https://github.com/castorini/anserini.git
RUN cd anserini && \
    mvn clean -Dmaven.test.skip=true package appassembler:assemble

# copy fatjar from anserini to app
# will be done at entrypoint
#COPY $EXTERNAL_PATH/anserini/target/anserini-*-fatjar.jar $APP_PATH/pyserini/resources/jars

## entrypoint: call entrypoint.sh instead
WORKDIR $APP_PATH
#CMD ["bash"]
LICENSE.txt
ADDED
@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

Copyright 2019-2021 Pyserini authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
MANIFEST.in
ADDED
@@ -0,0 +1,5 @@
include *.md
include requirements.txt
include pyserini/2cr/*.yaml
include pyserini/2cr/*.template
include pyserini/resources/jars/anserini-*-fatjar.jar
README.md
CHANGED
@@ -1,12 +1,261 @@
 ---
-title:
-
-colorFrom: red
-colorTo: pink
+title: NetsPresso_QA
+app_file: run_ralm_netspresso_doc.py
 sdk: gradio
-sdk_version: 3.
-app_file: app.py
-pinned: false
+sdk_version: 3.41.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# Text retrieval inference (indexing, search)

## Installation
1. Download the repository

```bash
git clone https://github.com/nota-github/np_app_text_retrieval_inference
```

2. Build and run the Docker image that defines the model environment
```bash
cd np_app_text_retrieval_inference
docker build --cache-from notadockerhub/np_app_text_retrieval_inference:latest -t notadockerhub/np_app_text_retrieval_inference:latest -f ./Dockerfile .
docker run --name {container_name} --shm-size=8g -it --gpus '"device=0"' -v {your_code_dir}:/root/np_app_text_retrieval_inference -v /{your_data_dir}:/workspace/datasets notadockerhub/np_app_text_retrieval_inference:latest
```

* During retrieval, the GPU is used only for BERT-based query encoding. Since that takes a small fraction of the total time, running on CPU alone makes little difference in speed.
* When indexing your own documents, the passages are encoded once with BERT; in that case a GPU saves considerable time compared to a CPU.
* The current implementation supports a single GPU only; to use multiple GPUs, launch individual processes and run them in parallel.

* Most of the code is based on [pyserini](https://github.com/castorini/pyserini).

## Dataset
```
datasets
|-- dataset_name
|   |-- collection.jsonl
|   |-- queries.tsv
|   |-- qrels.txt (optional, for quantitative evaluation)
```

* collection.jsonl: each line is `{"id": "PASSAGE_ID", "contents": "CONTENTS"}`.
* queries.tsv: each line is `QUERY_ID\tCONTENTS`.
* qrels.txt: each line is `QUERY_ID QUERY_TYPE PASSAGE_ID RELEVANCE_SCORE` (a toy example of all three files is sketched below).

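As a toy illustration of these three formats, the following snippet writes one record of each; the dataset name `toy_dataset`, the IDs, and the text are hypothetical placeholders, not files shipped with this repository:

```python
# Minimal sketch: write one toy record in each of the three expected formats.
# "toy_dataset", the IDs, and the text are hypothetical placeholders.
import json
import pathlib

root = pathlib.Path("datasets/toy_dataset")
root.mkdir(parents=True, exist_ok=True)

# collection.jsonl: one JSON object per line with "id" and "contents"
with open(root / "collection.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps({"id": "doc0#0", "contents": "Example passage text."}, ensure_ascii=False) + "\n")

# queries.tsv: QUERY_ID <tab> query text
with open(root / "queries.tsv", "w", encoding="utf-8") as f:
    f.write("q0\tExample query text\n")

# qrels.txt: QUERY_ID QUERY_TYPE PASSAGE_ID RELEVANCE_SCORE, space separated
with open(root / "qrels.txt", "w", encoding="utf-8") as f:
    f.write("q0 Q0 doc0#0 1\n")
```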
## Recommended retriever
* sparse model: BM25
* dense model
  * multi-lingual: mDPR, mContriever
  * multi-vector: colBERT
* hybrid model: sparse (first-pass) + dense (reranking)
* As a baseline for encoding multilingual queries, use `castorini/mdpr-question-nq`.
* Various language-specific pre-trained models can be found on the [HuggingFace model hub](https://huggingface.co/models).

## Sample dataset
* [mrtydi-korean](https://github.com/castorini/mr.tydi)
  * A benchmark dataset for multilingual retrieval covering 11 languages
  * For Korean, it provides 1,496,126 passages and 421 test queries
  * Multi-field data with `title` and `text` can be used (in general, only `text` is available)
* The original data and the pre-built indexing outputs can be downloaded from the [data hub](https://www.notion.so/notaai/Data-Hub-V1-Current-Version-45b0b0aa62084b3e985244ebb264d444?pvs=4).
  * @data_hub:/ssd2/np_app/Dataset_Hub/Semantic_Search/{corpus,indexes}

## Procedure

### 1. Indexing
* For fast retrieval, the index over the collection's passages is computed in advance.
* A pre-built index can be used instead of running the indexing step yourself.
  * For mrtydi-korean, one is available on the data hub.
  * [pre-built indexes for benchmark corpus/model](https://github.com/castorini/pyserini/blob/master/docs/prebuilt-indexes.md) (see the Python sketch below)
</br>

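As a side note on the pre-built indexes mentioned above, pyserini can also download and open its published Lucene indexes directly from Python. A minimal sketch follows; the index label `mrtydi-v1.1-korean` is an assumption about pyserini's naming of the Mr. TyDi indexes, so check `LuceneSearcher.list_prebuilt_indexes()` for the exact key:

```python
# Minimal sketch, assuming the pre-built Mr. TyDi Korean Lucene index is
# registered as "mrtydi-v1.1-korean"; verify with list_prebuilt_indexes().
from pyserini.search.lucene import LuceneSearcher

# print(LuceneSearcher.list_prebuilt_indexes())  # inspect the available labels
searcher = LuceneSearcher.from_prebuilt_index("mrtydi-v1.1-korean")
searcher.set_language("ko")  # Korean analyzer, mirroring the --language option used below

hits = searcher.search("최초로 전기 자동차를 개발한 기업은 어디인가?", k=10)
for rank, hit in enumerate(hits, start=1):
    print(rank, hit.docid, round(hit.score, 4))
```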
* dense model
```
python -m pyserini.encode \
  input   --corpus /path/to/dataset/collection.jsonl \
          --fields text \
  output  --embeddings indexes/dataset_name/dense \
          --to-faiss \
  encoder --encoder huggingface_model_name_or_checkpoint_path \
          --fields text \
          --max-length $MAX_LENGTH \
          --batch $BATCH_SIZE \
          --fp16
```
  * huggingface_model_name_or_checkpoint_path: a model name from the HuggingFace model hub or a checkpoint path
    * e.g., for mrtydi: use `castorini/mdpr-passage-nq` (query encoding at retrieval time: `castorini/mdpr-question-nq`)
    * tied (vs. split) means the passage and query encoders are shared (vs. separate)

* sparse model
```
python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input datasets/dataset_name/collection.jsonl \
  --index /path/to/indexing/sparse \
  --fields text \
  --generator DefaultLuceneDocumentGenerator \
  --language $LANG_CODE \
  --threads $NUM_THREADS \
  --storePositions --storeDocvectors --storeRaw
```
  * Language codes follow ISO 639-1 (e.g., en, ko, ja, zh).

* To use multiple fields, separate the fields with \n inside the collection's "contents" text and pass the field names to --fields (i.e., --fields title text).
  * mrtydi uses '\n\n' as the delimiter:
```
{"id": "5#1", "contents": "지미 카터\n\n지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다. 조지아 공과대학교를 졸업하였다. 그 후 해군에 들어가 전함·원자력·잠수함의 승무원으로 일하였다. 1953년 미국 해군 대위로 예편하였고 이후 땅콩·면화 등을 가꿔 많은 돈을 벌었다. 그의 별명이 \"땅콩 농부\" (Peanut Farmer)로 알려졌다."}
```

* MAX_LENGTH: maximum length of the positional embeddings (e.g., BERT: 512, DPR: 256)

* Output (dir: /path/to/indexing)
  - docid: sets of passage id
  - index: concatenation of (compressed) index vectors, binary file

### 2. Search
* Ranks the indexed collection against a given query.

#### online
* with sparse indexing
```
export QUERY="최초로 전기 자동차를 개발한 기업은 어디인가?"
python search_online.py --index_type sparse --index /path/to/indexing/sparse --query "$QUERY" --lang_abbr $LANG_CODE
```
<details>
<summary>Example output</summary>
<pre><code>
1 1830196#0 21.52590
{
"id" : "1830196#0",
|
127 |
+
}
|
128 |
+
2 128660#8 19.02320
|
129 |
+
{
|
130 |
+
"id" : "128660#8",
|
131 |
+
"contents" : "1990년대에 들어선 직후 가솔린자동차에 의한 환경오염문제가 대두되었다. 1996년 제너럴 모터스(GM)사는 양산 전기차 1호로 볼 수 있는 'EV1' 전기자동차를 개발한다. 이 전기자동차는 미국 캘리포니아 지역에서 임대형식으로 보급된다. 그러나 GM사는 수요가 크지 않아 수익성이 낮다는 이유로 1년만에 전기자동차 'EV1'의 조립라인을 폐쇄한다."
|
132 |
+
}
|
133 |
+
3 320611#0 18.99790
|
134 |
+
{
|
135 |
+
"id" : "320611#0",
|
136 |
+
"contents" : "기아 그랜토(Kia Granto) 또는 아시아 그랜토(Asia Granto)는 1995년에 아시아자동차가 생산한 대형 트럭이다. 기아차가 일본 히노 자동차와 기술 제휴해서 히노 프로피아의 차체로 개발한 대형 트럭이다. 기존의 AM 트럭의 후속 차종으로 개발한 트럭으로, 아시아자동차가 창사 30주년을 기념해서 개발한 트럭이다. 선택 사양으로 ABS 브레이크, 속도 제한 장치, 브레이크 라이닝 간극 자동 조정 장치, 오토 그리스, 튜브형 브레이크 파이프, 전기식 변속기 전환 장치 등을 탑재하였다. 1997년에 대한민국산 트럭 최초로 U자형 적재함을 탑재하였으며, 최고 출력 430마력의 FY(8×4) 23톤 덤프 트럭을 출시하였다. 1999년에 아시아자동차가 기아자동차에게 흡수 합병되었으며, 이후 기아자동차에서 생산하다가 2000년 8월에 배기 가스 규제를 충족시키지 못하여 후속 차종 없이 단종되면서, 기아자동차는 대형 트럭 사업을 스카니아 코리아에 양도함에 따라 대형 트럭의 시장에서 완전히 철수하였다."
|
137 |
+
}
|
138 |
+
4 1226703#1 18.78540
|
139 |
+
{
|
140 |
+
"id" : "1226703#1",
|
141 |
+
"contents" : "1845년에 회사를 창립 했으며 독일의 전지형 기중기 생산하는 기업 중 가장 오래되었다. 1868년에 말이 끄는 소방차를 개발했으며 1890년에 최초로 증기 소방 차량을 생산했다. 1914년에 최초로 트럭과 특수 차량을 제작했다. 1918년에 안스바흐 자동차 공장과 뉘르베르크 자동차 공장을 합병했다. 1937년에 3축 트럭을 생산 했으며 1943년에 제2차 세계대전으로 기존 공장이 파괴되면서 새로운 공장을 건설했다. 1956년에 군사 목적을 위해 대형 트력과 장비를 개발했다. 1960년대에 ��초로 기중기를 제작하기 시작 했으며 1970년대부터 1980년대까지 개발했다. 1985년에 최대 50톤 용량의 가진 전지형 기중기를 개발했다. 1990년 일본의 기중기 회사였던 타다노에 인수 되었다. 1991년에 일본 수출을 위해 전지형 기중기를 생산했다. 1995년에 회사 창립 150주년이 되었다. 2004년에 최초로 험지형 기중기를 제작한데 이어 2009년에 트럭 기중기를 제작했다. 2013년에 공장을 확장 및 이전하면서 현재에 이르고 있다."
|
142 |
+
}
|
143 |
+
5 1045157#14 18.30410
|
144 |
+
{
|
145 |
+
"id" : "1045157#14",
|
146 |
+
"contents" : "2010년 3월 세계최초의 2000cc급 자동차를 위한 15Kw급 BLDC발전기 개발, 전기자동차의 주행거 리 제한 극복 세계최초의 동급 내연이륜차의 성능을 능가하는 전기스쿠터 힐리스 모델출시 및 신차발표회 EV전시장 오픈"
|
147 |
+
}
|
148 |
+
6 128661#7 17.92510
|
149 |
+
{
|
150 |
+
"id" : "128661#7",
|
151 |
+
"contents" : "1991년 11월 21일 현대자동차는 한국내에서는 최초의 전기자동차를 독자개발했다고 발표했다."
|
152 |
+
}
|
153 |
+
7 1312657#1 17.78780
|
154 |
+
{
|
155 |
+
"id" : "1312657#1",
|
156 |
+
"contents" : "1939년에 이탈리아 나폴리 출신인 빈센조 앙헬레스 게르바지오()와 타예레스 나폴리()에 의해 설립했다. 제2차 세계대전 당시 스페인에서 트럭을 생산하기 위해 차체 및 용접을 했으나, 이후 샤시에 특장 트럭 캡 디자인을 개발했다. 1958년에 최초로 공장이 이전되면서 버스를 생산하기 시작했다. 1960년에 세계 최초로 2층 버스를 생산했다. 1962년에 생산 공장이 재이전 되면서 팩토리아스 나폴리스 SA()에 인수되었다. 이 회사는 상용차를 생산한 업체로 주로 버스와 트럭을 생산했다. 1966년에 바헤이로스 디젤 SA()에 매각했다. 1969년에 다시 크라이슬러에 마각이 되었지만 버스 제조 부문의 경우 1971년에 벨기에의 자동차 제조 기업인 반호르에 매각되었다. 1983년에 반호르가 최대 주주가 되었고 인수 최기에 반호르의 브랜드로 차량 생산을 했지만 이후 이스파노 카로세라 SAL()로 사명이 변경되었다. 1997년에 이탈리아의 자동차 제조 기업인 피닌파라나()와 제휴를 맺고 시내버스 모델인 아비토와 고속버스 모델인 디보를 개발하기 시작했다. 2000년 9월에 모로코의 수도 카사블랑카에 공장을 설립했다. 2005년에 인도의 자동차 제조 기업인 타타자동차가 21%의 지분을 획득한데 이어 2009년에 지분 79%를 인수하면서 자회사가 되었다. 2010년에 현재의 사명으로 변경이 되었다. 2013년 9월에 타타자동차는 사라고사 공장 폐쇄를 발표했다. 매출 하락과 미래 전망이 불투명으로 폐쇄 결정을 내렸다."
|
157 |
+
}
|
158 |
+
8 128660#63 17.71300
|
159 |
+
{
|
160 |
+
"id" : "128660#63",
|
161 |
+
"contents" : "후지중공업과 마츠비시 자동차는 2005년 8월에 전기자동차의 개발 계획을 발표하였다. 이 2개 회사가 거의 중지 상태였던 전기자동차의 개발을 재개하고 있다. 2008년에 들어 닛산-르노 연합이 전기자동차로 본격 참여 방침을 표명하였고, 도요타도 2010년대 초반에 전기자동차를 출시하기로 발표하는 등 전기 자동차가 활성화 조짐을 보이고 있다."
|
162 |
+
}
|
163 |
+
9 126891#2 17.63640
|
164 |
+
{
|
165 |
+
"id" : "126891#2",
|
166 |
+
"contents" : "2007년, 스웨덴의 대표 자동차 메이커인 볼보는 세계 최초로 에탄올 자동차를 제작해서 자동차 경주에 참가했다. 스웨덴에서는 가솔린 자동차의 도시내 사용을 줄이고, 시민들이 자전거로 생활할 수 있게끔 유도하고 있다. 또한 볼보에서 친환경 자동차를 적극적으로 개발하게 하고, 시민들에게는 친환경 자동차 구입비에 150만 원의 보조금을 지급하며, 연료비는 가솔린의 70% 가격에 주유할 수 있게 하는 등 적극적인 탈석유 정책을 시행하고 있다."
|
167 |
+
}
|
168 |
+
10 128660#3 17.29680
|
169 |
+
{
|
170 |
+
"id" : "128660#3",
|
171 |
+
"contents" : "전기자동차는 디젤 엔진, 가솔린 엔진을 사용하는 오토사이클(정적사이클)방식의 자동차보다 먼저 고안 되었다. 1830년부터 1840년 사이에 영국 스코틀랜드의 사업가 앤더슨이 전기자동차의 시초라고 할 수 있는 세계 최초의 원유전기마차를 발명한다. 1835년에 네덜란드 크리스토퍼 베커는 작은 크기의 전기자동차를 만든다."
|
172 |
+
}
|
173 |
+
</code></pre>
|
174 |
+
|
175 |
+
</details>
|
176 |
+
|
177 |
+
* with dense indexing
|
178 |
+
```
|
179 |
+
python search_online.py --index_type dense --index /path/to/indexing/dense --query "$QUERY" --encoder huggingface_model_name_or_checkpoint_path --device $DEVICE
|
180 |
+
```
|
181 |
+
* DEVICE: 'cpu' or 'cuda:$GPU_ID'
|
182 |
+
* search는 현재는 single gpu만 지원됩니다. multi gpu를 사용하려면 individual process를 만들어서 병렬로 처리해야 합니다.
|
183 |
+
|
184 |
+
* with hybrid (first-pass: sparse, reranking: dense) indexing
|
185 |
+
```
|
186 |
+
python search_online.py --index_type hybrid --index /path/to/indexing/sparse,/path/to/indexing/dense --query "$QUERY" --encoder huggingface_model_name_or_checkpoint_path --device $DEVICE --alpha $ALPHA_MULTIPLIED_ON_SPARSE_SCORE --normalization --lang_abbr $LANG_CODE
|
187 |
+
```
|
188 |
+
* ALPHA_MULTIPLIED_ON_SPARSE_SCORE는 (0,2)에서 line search를 하면서 최적의 값을 찾으며 0.5가 기본값입니다.
|
189 |
+
|
190 |
+
#### batch
|
191 |
+
|
192 |
+
* with dense indexing
|
193 |
+
```
|
194 |
+
python -m pyserini.search.faiss \
|
195 |
+
--encoder huggingface_model_name_or_checkpoint_path \
|
196 |
+
--index /path/to/indexing_dense \
|
197 |
+
--topics datasets/dataset_name/queries.tsv \
|
198 |
+
--output /path/to/runfile --batch $BATCH_SIZE --threads $NUM_THREADS \
|
199 |
+
--hits $TOPK --remove-query --device $DEVICE
|
200 |
+
```
|
201 |
+
* BATCH_SIZE, NUM_THREADS는 기본값을 64, 16으로 사용합니다.
|
202 |
+
|
203 |
+
* with sparse indexing
|
204 |
+
```
|
205 |
+
python -m pyserini.search.lucene --bm25 \
|
206 |
+
--topics datasets/dataset_name/queries.tsv \
|
207 |
+
--index /path/to/indexing_sparse \
|
208 |
+
--hits $TOPK \
|
209 |
+
--language $LANG_CODE \
|
210 |
+
--output /path/to/runfile
|
211 |
+
```
|
212 |
+
|
213 |
+
* hybrid model
|
214 |
+
```
|
215 |
+
python -m pyserini.search.hybrid \
|
216 |
+
dense --index /path/to/indexing_dense \
|
217 |
+
--encoder huggingface_model_name_or_checkpoint_path \
|
218 |
+
--device $DEVICE \
|
219 |
+
sparse --index /path/to/indexing_sprase \
|
220 |
+
fusion --alpha $ALPHA_MULTIPLIED_ON_SPARSE_SCORE \
|
221 |
+
run --topics datasets/dataset_name/queries.jsonl \
|
222 |
+
--output /path/to/runfile \
|
223 |
+
--threads $NUM_THREADS \
|
224 |
+
--batch-size $BATCH_SIZE \
|
225 |
+
--hits $TOPK
|
226 |
+
|
227 |
+
python -m pyserini.search.hybrid \
|
228 |
+
dense --index path/to/indexing/dense \
|
229 |
+
--encoder huggingface_model_name_or_checkpoint_path \
|
230 |
+
--device $DEVICE \
|
231 |
+
sparse --index /path/to/indexing/sprase \
|
232 |
+
fusion --alpha $ALPHA_MULTIPLIED_ON_SPARSE_SCORE \
|
233 |
+
run --topics datasets/dataset_name/queries.tsv \
|
234 |
+
--output runs/hybrid.run \
|
235 |
+
--threads $NUM_THREADS \
|
236 |
+
--batch-size $BATCH_SIZE \
|
237 |
+
--hits 1000
|
238 |
+
```
|
239 |
+
|
240 |
+
* 결과물 (dir: /path/to/runfile)
|
241 |
+
format: qid q_type pid topK score retrieval_type
|
242 |
+
example:
|
243 |
+
```
|
244 |
+
46 Q0 271267 1 2.134944 Faiss
|
245 |
+
46 Q0 63734 2 2.118700 Faiss
|
246 |
+
46 Q0 174045 3 2.110519 Faiss
|
247 |
+
...
|
248 |
+
```
|
249 |
+
|
250 |
+
### 3. Evaluation (optional)
|
251 |
+
* **qrels** 파일은 정량평가를 위한 ground truth 파일로, qid q_type pid relevance_score 형식으로 구성되어 있음.
|
252 |
+
* **runfile**은 batch로 검색한 결과로, qid q_type pid topK score retrieval_type 형식으로 구성되어 있음.
|
253 |
+
* 아래 스크립트는 qrels 파일과 runfile을 비교하여 nDCG@10, MRR@100, Recall@100 등의 지표를 계산함.
|
254 |
+
```
|
255 |
+
python -m pyserini.eval.trec_eval -c -mndcg_cut.10 -mrecip_rank -mrecall.100 /path/to/qrels /path/to/runfile
|
256 |
+
|
257 |
+
recip_rank all 0.3628
|
258 |
+
recall_100 all 0.7158
|
259 |
+
ndcg_cut_10 all 0.3805
|
260 |
+
```
|
261 |
|
|
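One way to run the line search over (0, 2) for the hybrid fusion weight is to loop the batch hybrid search and the trec_eval command above over candidate values and keep the best-scoring one. This is only a rough sketch: the paths, the candidate grid, and the use of nDCG@10 as the selection metric are assumptions, and the last-token parsing of the trec_eval output should be double-checked against its actual output format.

```python
# Rough sketch: sweep the hybrid fusion weight and keep the best nDCG@10.
# All paths are placeholders; both commands are the ones shown in this README.
import subprocess

qrels = "/path/to/qrels"
best = None
for alpha in [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]:
    run = f"runs/hybrid_alpha{alpha}.run"
    subprocess.run(
        ["python", "-m", "pyserini.search.hybrid",
         "dense", "--index", "/path/to/indexing_dense",
         "--encoder", "castorini/mdpr-question-nq",
         "sparse", "--index", "/path/to/indexing_sparse",
         "fusion", "--alpha", str(alpha),
         "run", "--topics", "datasets/dataset_name/queries.tsv",
         "--output", run, "--hits", "100"],
        check=True)
    out = subprocess.run(
        ["python", "-m", "pyserini.eval.trec_eval", "-c", "-mndcg_cut.10", qrels, run],
        capture_output=True, text=True, check=True).stdout
    ndcg = float(out.strip().split()[-1])  # "ndcg_cut_10  all  0.xxxx" -> last token
    if best is None or ndcg > best[0]:
        best = (ndcg, alpha)
    print(f"alpha={alpha}: nDCG@10={ndcg:.4f}")
print("best (nDCG@10, alpha):", best)
```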
__pycache__/search_online.cpython-38.pyc
ADDED
Binary file (4.58 kB).
analyze_answer_inclusion_in_retrieval.py
ADDED
@@ -0,0 +1,100 @@
import json
from pyserini.search.lucene import LuceneSearcher

from tqdm import tqdm

def convert_unicode_to_normal(data):
    if isinstance(data, str):
        return data.encode('utf-8').decode('utf-8')
    elif isinstance(data, list):
        assert(isinstance(data[0], str))
        return [sample.encode('utf-8').decode('utf-8') for sample in data]
    else:
        raise ValueError

K=30
index_dir="/root/indexes/index-wikipedia-dpr-20210120" # lucene
runfile_path=f"runs/q=NQtest_c=wikidpr_m=bm25_k={K}.run" # bm25
qafile_path="/root/nota-fairseq/examples/information_retrieval/open_domain_data/NQ/qa_pairs/test.jsonl"
logging_path="logging_q=NQ_c=wiki_including_ans.jsonl"

# define searcher with pre-built indexes
searcher = LuceneSearcher(index_dir=index_dir)

# v2. read qa first (due to runfile query name sort)
print("read qa file")
pair_by_qid = {}
with open(qafile_path, 'r') as fr_qa:
    for pair in tqdm(fr_qa):
        pair_data = json.loads(pair)
        qid, query, answers = pair_data["qid"], pair_data["query"], pair_data["answers"] # str, str, list
        pair_by_qid[qid] = {'query': query, 'answers': answers}

print("check retrieved passage include answer")
qid_with_ans_in_retrieval = []
with open(runfile_path, 'r') as fr_run, open(logging_path, 'w') as fw_log:
    for result in tqdm(fr_run):
        fields = result.split(' ')
        assert(len(fields) == 6) # qid q_type pid k score engine

        qid_, pid = fields[0], fields[2]
        assert(qid_ in pair_by_qid.keys())
        query, answers = pair_by_qid[qid_]['query'], pair_by_qid[qid_]['answers']

        # get passage
        psg_txt = searcher.doc(pid)
        psg_txt = psg_txt.raw()
        psg_txt = json.loads(psg_txt)
        psg_txt = psg_txt['contents'].strip()
        psg_txt = convert_unicode_to_normal(psg_txt)

        # check if passage contains answer
        #if any([ans in psg_txt for ans in answers]):
        for ans in answers:
            if ans in psg_txt:
                log_w = {
                    "qid": qid_,
                    "pid": pid,
                    "query": query,
                    "answer": ans,
                    "passage": psg_txt
                }
                fw_log.write(json.dumps(log_w, ensure_ascii=False) + '\n')

                if qid_ not in qid_with_ans_in_retrieval:
                    qid_with_ans_in_retrieval.append(qid_)
                break # don't have to count check multiple answer in passage


print(f"#qid in test set: {len(pair_by_qid.keys())}, #qid having answer with retrieval(BM25, K={K}): {len(qid_with_ans_in_retrieval)}, Recall = {len(qid_with_ans_in_retrieval)/len(pair_by_qid.keys())*100}")

# v1
"""
with open(runfile_path, 'r') as fr_run, open(qafile_path, 'r') as fr_qa:
    for pair in tqdm(fr_qa):
        pair_data = json.loads(pair)
        qid, query, answers = pair_data["qid"], pair_data["query"], pair_data["answers"] # str, str, list

        for k in range(K):
            result=fr_run.readline()
            print(result)

            fields = result.split(' ')
            assert(len(fields) == 6) # qid q_type pid k score engine

            qid_, pid = fields[0], fields[2]

            assert(qid == qid_), f"qid={qid}, qid_={qid_} should be same"

            # get passage
            psg_txt = searcher.doc(pid)
            psg_txt = psg_txt.raw()
            psg_txt = json.loads(psg_txt)
            psg_txt = psg_txt['contents'].strip()
            psg_txt = convert_unicode_to_normal(psg_txt)

            # check if passage contains answer
            if any([ans in psg_txt for ans in answers]):
                import pdb
                pdb.set_trace()
"""
collections/.gitkeep
ADDED
@@ -0,0 +1 @@
# This is the default directory for document collections. Placeholder so that directory is kept in git.
convert_cast22_eval_json_to_queries_tsv.py
ADDED
@@ -0,0 +1,24 @@
import json

from tqdm import tqdm

### read: cast 22 eval json
eval_json_path="/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# read data
with open(eval_json_path, 'r') as fr:
    data = json.load(fr)

# write: queries.tsv (format: {qid}\t{query})
eval_queries_path = "/root/Corpus/CAsT22_eval_queries/queries_cqr_result.tsv"
with open(eval_queries_path, 'w') as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            automatic_rewritten_utterance = turn['automatic_rewritten_utterance']

            q_id = f"{conv_id}_{turn_id}"

            fw.write(f"{q_id}\t{automatic_rewritten_utterance}\n")
download_oracle_jdk.sh
ADDED
@@ -0,0 +1,15 @@
#!/bin/bash

ARCH=$(uname -m)
echo "Architecture: ${ARCH}"

if [ "$ARCH" = "x86_64" ]
then
    JDK_URL="https://download.oracle.com/java/20/latest/jdk-20_linux-x64_bin.tar.gz"
else
    JDK_URL="https://download.oracle.com/java/20/latest/jdk-20_linux-aarch64_bin.tar.gz"
fi

wget --no-check-certificate --no-cookies --header "Cookie: oraclelicense=accept-securebackup-cookie" "${JDK_URL}"
tar -xzf jdk-*.tar.gz
rm jdk-*.tar.gz
encode_8shards_cuda0.sh
ADDED
@@ -0,0 +1,26 @@
#!/bin/bash

#SHARD_NUM=8
SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"

START_IDX=0
END_IDX=1
SHARD_NUM=8
GPU_ID=0

for SHARD_IDX in $(seq $START_IDX $END_IDX); do
    echo "encoding for SHARD_IDX = $SHARD_IDX"
    python -m pyserini.encode \
        input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
                --fields text \
                --shard-id $SHARD_IDX \
                --shard-num $SHARD_NUM \
        output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
                --to-faiss \
        encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
                --fields text \
                --device cuda:$GPU_ID \
                --batch 128 \
                --fp16
done
encode_8shards_cuda1.sh
ADDED
@@ -0,0 +1,26 @@
#!/bin/bash

#SHARD_NUM=8
SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"

START_IDX=2
END_IDX=3
SHARD_NUM=8
GPU_ID=1

for SHARD_IDX in $(seq $START_IDX $END_IDX); do
    echo "encoding for SHARD_IDX = $SHARD_IDX"
    python -m pyserini.encode \
        input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
                --fields text \
                --shard-id $SHARD_IDX \
                --shard-num $SHARD_NUM \
        output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
                --to-faiss \
        encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
                --fields text \
                --device cuda:$GPU_ID \
                --batch 128 \
                --fp16
done
encode_8shards_cuda2.sh
ADDED
@@ -0,0 +1,26 @@
#!/bin/bash

#SHARD_NUM=8
SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"

START_IDX=4
END_IDX=5
SHARD_NUM=8
GPU_ID=2

for SHARD_IDX in $(seq $START_IDX $END_IDX); do
    echo "encoding for SHARD_IDX = $SHARD_IDX"
    python -m pyserini.encode \
        input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
                --fields text \
                --shard-id $SHARD_IDX \
                --shard-num $SHARD_NUM \
        output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
                --to-faiss \
        encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
                --fields text \
                --device cuda:$GPU_ID \
                --batch 128 \
                --fp16
done
encode_8shards_cuda3.sh
ADDED
@@ -0,0 +1,26 @@
#!/bin/bash

#SHARD_NUM=8
SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"

START_IDX=6
END_IDX=7
SHARD_NUM=8
GPU_ID=3

for SHARD_IDX in $(seq $START_IDX $END_IDX); do
    echo "encoding for SHARD_IDX = $SHARD_IDX"
    python -m pyserini.encode \
        input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
                --fields text \
                --shard-id $SHARD_IDX \
                --shard-num $SHARD_NUM \
        output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
                --to-faiss \
        encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
                --fields text \
                --device cuda:$GPU_ID \
                --batch 128 \
                --fp16
done
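The four `encode_8shards_cuda{0..3}.sh` scripts are identical except for `START_IDX`, `END_IDX`, and `GPU_ID` (GPU g encodes shards 2g and 2g+1). If keeping four copies in sync becomes a burden, a single parameterized driver could replace them. The sketch below is one option; the file name `encode_shards.py` is hypothetical, and the paths simply mirror the scripts above.

```python
# Sketch of one driver replacing the four per-GPU shard-encoding scripts:
# GPU g encodes shards 2*g and 2*g+1. Paths mirror encode_8shards_cuda*.sh.
import subprocess
import sys

CORPUS_PREFIX = "/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
EMBED_PREFIX = "/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"
SHARD_NUM = 8

gpu_id = int(sys.argv[1])  # e.g. `python encode_shards.py 0` replaces encode_8shards_cuda0.sh
for shard_idx in (2 * gpu_id, 2 * gpu_id + 1):
    print(f"encoding for SHARD_IDX = {shard_idx}")
    subprocess.run(
        ["python", "-m", "pyserini.encode",
         "input", "--corpus", f"{CORPUS_PREFIX}/shard_{shard_idx}",
         "--fields", "text",
         "--shard-id", str(shard_idx), "--shard-num", str(SHARD_NUM),
         "output", "--embeddings", f"{EMBED_PREFIX}/shard_{shard_idx}", "--to-faiss",
         "encoder", "--encoder", "castorini/tct_colbert-v2-msmarco-cqe",
         "--fields", "text", "--device", f"cuda:{gpu_id}",
         "--batch", "128", "--fp16"],
        check=True)
```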
entrypoint.sh
ADDED
@@ -0,0 +1,23 @@
#!/bin/bash

HOME=/root

# check pth
echo "check path for JAVA_HOME"
echo $JAVA_HOME

# additional environment variable
export PYSERINI_CACHE=/root/indexes

# APP_PATH: assumed to be volume mounted with host
APP_PATH=$HOME/np_app_text_retrieval_inference

# EXTERNAL_PATH: assumed having system specific files, not volume mounted with host
EXTERNAL_PATH=/workspace/external

# copy jar file
cp $EXTERNAL_PATH/anserini/target/anserini-*-fatjar.jar $APP_PATH/pyserini/resources/jars

# launch /bin/bash
cd $APP_PATH
exec /bin/bash
flagged/log.csv
ADDED
@@ -0,0 +1,272 @@
질의,검색 결과,flag,username,timestamp
"몸에 반점이 있다면 어떤 질병일 가능성이 있을까?
","문서 1
정확한 원인은 밝혀져 있지 않으나 유전적, 환경적 요인이 복합되어 작용하는 것으로 짐작된다. 이 질병을 앓는 가족 구성원이 있다면 본인에게도 영향을 받을 가능성이 있다. 영향을 받은 일란성 쌍둥이가 있다면 30%의 확률로 다른 쌍둥이도 이 질병을 가지고 있을 가능성이 있다. 이 질병은 스트레스, 감염, 출생 시에 발병할 수 있다. 제1형 당뇨병, 류마티스 관절염과 같은 자가면역 질환을 가진 환자들이 영향을 받을 가능성이 있다. 흡연은 이 질병의 위험성을 높이며 안구 문제를 더 악화시킬 수 있다. 이 질병은 TSI라는 항체에서 비롯하며 이는 갑상샘 자극 호르몬과 영향도가 비슷하다. 이러한 항체들은 갑상샘이 갑상샘 호르몬을 과도하게 생산하도록 유도한다. 질병 확인을 위한 혈액 검사, 방사성 요오드 섭취를 통한 증상에 기반하여 진단한다.

문서 2
1603년 37세의 나이로 사망하였고, 아들 아흐메트 1세가 왕위를 이어받았다. 사망 원인은 정확하게 알려져 있지 않으며, 자연사나 질병일 가능성이 있다.

문서 3
이상의 연구 결과들은 오직 가스 행성 및 갈색 왜성이 존재하지 않음을 밝혀줬을 뿐 지구나 금성처럼 작은 행성이 존재할 가능성은 아직 남아 있다. 만약 뜨거운 목성이 항성 가까이 있었다면 이들은 항성 근처를 돌던 지구형 행성의 궤도를 망가뜨려 생명체 발생 가능성을 낮췄을 것이다. 따라서 가스 행성이 없다는 것은 지구 비슷한 행성이 존재할 가능성을 높여 주는 사실이 된다. 통상적으로 밝혀진 연구에 따르면 중원소 함량이 높은 별 주위에는 행성이 있을 확률이 높고 그렇지 않은 별 주위에는 행성이 있을 확률이 적은 것으로 밝혀졌다. 두꺼운 먼지 원반이 있다는 사실은 항성 가까이에 암석 행성이 존재할 가능성을 높이는 것은 사실이다. 그러나 이는 동시에 그 암석 행성이 폭격을 받을 가능성이 높음을 알려주는 사실이기도 하다. 만약 행성이 발견된다면 이후의 연구 방향은 이 행성에 생명체가 살 수 있는 물과 대기가 존재하는지를 충분한 해상도의 망원경을 통해 알아내는 것이 된다. 지구의 산소가 생명체 존재를 나타내는 척도가 되는 것처럼 무기성의 대기 조성은 원시 생명체가 존재함을 나타내는 징표가 될 수 있다.

문서 4
치매나 백혈병, 당뇨, 파킨슨병과 같은 난치병들 중에는 세포의 변이나 사멸로 인한 질병이 대다수이다. 이러한 퇴행성 질병의 경우 세포 치료법을 이용해여 치료하는 경우가 많다. 특이점주의자들에 따르면 줄기세포 연구와 같은 세포 연구는 생명 공학 연구의 일부분이며 유전자 DNA 지도를 완벽하게 구조화할 수 있다면 세포분화 치료 혹은 세포복제 치료를 통해 환자 자신의 DNA를 지니고 텔로미어가 연장된 세포를 공급할 수 있을 것이라고 본다. 예컨데 현재 당뇨병 치료에 쓰이는 거부반응 제어제가 위험한 부작용을 일으킬 가능성이 있는 반면 이러한 세포 치료는 부작용 가능성을 급격히 낮출 수 있다. 이러한 세포 치료는 단순히 난치병의 치료에만 그치는 것이 아니라, 교차분화 기술을 이용한 새로운 신체 기관으로의 교체를 가능하게 한다.

문서 5
컴퓨터에 의한 시뮬레이션에는, 보이드로 불리는 틈새나 버그가 있고, 내부로부터도 아는 경우가 있을 지도 모르다. 그러한 것을 찾아내 검증할 수 있다면, 거기에 따라 모의현실의 내부에 있는 것을 증명할 수 있을 가능성이 있다. 그러나, 물리 법칙에 반하는 일은, 그 밖에도 설명할 수 있는 가설을 생각할 수 있다(신 등). 영화 《매트릭스》로 그려진 것처럼, 기시감 등의 일상적인 기묘한 체험도 어떠한 버그로서 설명할 수 있을 가능성이 있다.

문서 6
예를 들면 대기가 없는 어떤 행성 깊은 단층 속 그늘진 곳이나 화산 동굴 속에 생태계가 형성되어 있을 가능성이 있다. 저명했던 천문학자 칼 세이건은 이 개념을 태양계에 적용하여, 1976년 논문을 통해 목성의 대기 상층부에 유기체가 살고 있을 가능성을 피력했다. 그러나 목성에는 딱딱한 표면이 없기 때문에 생명체가 존재할 가능성은 거의 없다.

문서 7
널리 알려져 있다는 사실이 반드시 참임을 보증하는 것은 아니므로 이런 주장은 논리적으로 오류이다. 개인의 신념이 잘못되어 있을 가능성이 있다면 다수의 인간의 신념도 잘못되어 있을 가능성이 있다. 예를 들면, 75%의 사람이 A라고 대답했다고 해도 25%의 확률로 A가 아닐 가능성도 있다. 이 확률이 어떻게 되든 다수가 옳다는 것은 논리적이라고 할 수 없다. 만약 만장일치라고 해도, 표본의 수가 불충분할지도 모르고, 혹은 그 사람들이 모르는 사실이 존재하고 있어서 그 사실을 알면 결과가 바뀔지도 모른다.

문서 8
""(a and b)""와 같은 논리식을 계산한다고 하면 ""a""항이 거짓인 경우에, ""b""항을 계산하지 않아도 전체 식의 답을 알 수 있다. ""(a or b)""에서 ""a""항이 참인 경우에도 마찬가지이다. 여기서 항이 복잡한 식이면 이점이 많고, 식에서 결과가 참이나 거짓일 가능성과 계산의 비용에 따라 어떤 항이 먼저 계산되어야 좋은지 알 수 있다. 따라서 ""(a or b or c)""와 같은 식에서 ""a""항이 참값을 가질 가능성이 많다면, 전체 식을 쉽게 계산할 수 있다. 이런 가능성을 보장하기 위해, 컴파일러는 더 계산해야 할 것인지, 다른 항을 지름길 계산 해야 할 것인지를 검사하기도 한다. 이런 검사는 계산을 줄이는 것을 실패할 경우나 꼭 필요한 경우 무조건 전체 식을 계산해야 할 때 시간이 더 많이 걸리게 된다.

문서 9
생물학적으로는 인간의 뇌에 필요한 유전 정보를 가지는 인공적인 게놈을 적당한 숙주의 세포에 짜넣는 것으로 인공적으로 생명을 만드는 것도 가능할지도 모른다고 생각되며, 그러한 인공생명체는 의식을 가질 가능성이 높다. 그렇지만 그 생명체 안의 어떤 속성이 의식을 낳는 것일까? 비슷한 것을 비생물학적인 부품에서 만들 수 없는 것인지? 컴퓨터를 설계하기 위한 기술로 그러한 의식체를 낳을 수 없을까? 그러한 행위는 윤리적으로 문제 없을까?와 같은 여러 문제를 안고 있다.

문서 10
3부 리그",,,2023-06-16 00:57:35.174375
"모델 경량화 및 최적화와 관련하여 Netspresso bot에게 물어보세요.
예를들어

- Why do I need to use Netspresso?
- Summarize how to compress the model with netspresso.
- Tell me what the pruning is.
- What kinds of hardware can I use with this toolkit?
- Can I use YOLOv8 with this tool? If so, tell me the examples.","1. Why do I need to use Netspresso?
Netspresso is a machine learning pipeline that provides tools and methods for model compression and optimization. By using Netspresso, you can reduce the size and computational cost of your models, making them more efficient for deployment on resource-constrained devices. It also allows you to fine-tune and optimize your models for better performance. Overall, Netspresso helps you achieve lightweight and optimized models for real-time applications.

2. Summarize how to compress the model with Netspresso.
To compress a model with Netspresso, you can follow these steps:
- Step 1: Prepare your model.
- Step 2: Upload your model to Netspresso.
- Step 3: Choose the compression method (e.g., structured pruning, filter decomposition).
- Step 4: Configure the compression settings (e.g., pruning ratio, compression algorithm).
- Step 5: Start the compression process.
- Step 6: Check the compression result and download the compressed model.

3. Tell me what pruning is.
Pruning is a model compression technique that involves removing unnecessary parameters or structures from a complex model to make it more compact and efficient. In the context of Netspresso, pruning can be done using structured pruning, which removes entire neurons, filters, or channels from the model. This results in a simplified model that can be deployed without any specific hardware or software requirements for acceleration.

4. What kinds of hardware can I use with this toolkit?
Netspresso supports various hardware platforms for model deployment. Some of the supported hardware includes NVIDIA Jetson family devices, which can be used with TensorRT for accelerated inference. Other supported platforms include TensorFlow Lite and OpenVINO, which provide optimization for different hardware architectures. The specific hardware compatibility may depend on the compression method and the target framework used in Netspresso.

5. Can I use YOLOv8 with this tool? If so, tell me the examples.
Yes, you can use YOLOv8 with Netspresso for model compression and optimization. YOLOv8 is a popular object detection model, and Netspresso provides support for object detection models. You can follow the steps in the Netspresso documentation to prepare, upload, compress, and optimize your YOLOv8 model using the available methods and tools in Netspresso. The documentation provides examples and guidelines for using Netspresso with different models, including YOLOv8.","Source 0
Step 4: Package model (beta)
Jump to ContentDocumentationHomeDocumentationv1.3.0DocumentationLog InLog InMoon (Dark Mode)Sun (Light Mode)v1.3.0HomeDocumentationSearchINTRODUCTIONWelcomeWhat is NetsPresso?Machine Learning Pipeline with NetsPressoAdvantages of NetsPressoBENCHMARKSModel Searcher- Object DetectionModel Compressor- Image Classification- Object Detection- Semantic Segmentation- Super ResolutionUSE CASESFree People Detection ModelsFree Vehicle Detection ModelsNETSPRESSO MODEL SEARCHERFeatures & Scope of supportStep 1: Prepare datasetStep 2: Upload datasetStep 3: Create project (Quick Search)Step 3: Create project (Retraining)Step 4: Check the project result and download a modelNETSPRESSO MODEL COMPRESSORFeatures & Scope of supportMethod: Structured PruningMethod: Filter DecompositionSupported modelsSupported ONNX operatorsStep 1: Prepare modelStep 2: Upload modelStep 3: Compress model (Automatic Compression)Step 3: Compress model (Advanced Compression)Step 4: Check the compression result and download a modelStep 5: Retrain the compressed modelNETSPRESSO MODEL LAUNCHERFeatures & Scope of supportStep 1: Prepare modelStep 2: Upload modelStep 3: Convert model (beta)Step 4: Package model (beta)Personal serverRequirementsConnect Personal ServerRELEASE NOTESNetsPresso ReleasesFAQAbout Credit?Powered by Step 4: Package model (beta)Suggest EditsPackage the model so that it can be deployed directly to the device.
1. Go to Package page
Select New package at the drop-down menu that appears when you click Download button.
2. Package the model
Enter the package name and select a base model to package.
Please note that the package name will be the library name and the name cannot be changed after packaging.
You can include pre/post processing codes (.py) with the model for the package (optional).
Download the pre/post processing code example and modify for your use cases.
3. Download package file and run the package
Packaged file will be automatically downloaded.
To run the package, use the code written below. {package_name} must be changed to your package name.
Pythonfrom np_{package_name}.models.model import NPModel
NPModel.initialize(num_threads=1) # Initialize
npmodel = NPModel()
image_path = ""./test.jpg"" #Image path
print(npmodel.run(image_path)) # Inference
NPModel.finalize() # Memory management
Updated about 1 month ago Table of Contents
1. Go to Package page
2. Package the model
3. Download package file and run the package

######################################################

Source 1
Step 3: Convert model (beta)
Jump to ContentDocumentationHomeDocumentationv1.3.0DocumentationLog InLog InMoon (Dark Mode)Sun (Light Mode)v1.3.0HomeDocumentationSearchINTRODUCTIONWelcomeWhat is NetsPresso?Machine Learning Pipeline with NetsPressoAdvantages of NetsPressoBENCHMARKSModel Searcher- Object DetectionModel Compressor- Image Classification- Object Detection- Semantic Segmentation- Super ResolutionUSE CASESFree People Detection ModelsFree Vehicle Detection ModelsNETSPRESSO MODEL SEARCHERFeatures & Scope of supportStep 1: Prepare datasetStep 2: Upload datasetStep 3: Create project (Quick Search)Step 3: Create project (Retraining)Step 4: Check the project result and download a modelNETSPRESSO MODEL COMPRESSORFeatures & Scope of supportMethod: Structured PruningMethod: Filter DecompositionSupported modelsSupported ONNX operatorsStep 1: Prepare modelStep 2: Upload modelStep 3: Compress model (Automatic Compression)Step 3: Compress model (Advanced Compression)Step 4: Check the compression result and download a modelStep 5: Retrain the compressed modelNETSPRESSO MODEL LAUNCHERFeatures & Scope of supportStep 1: Prepare modelStep 2: Upload modelStep 3: Convert model (beta)Step 4: Package model (beta)Personal serverRequirementsConnect Personal ServerRELEASE NOTESNetsPresso ReleasesFAQAbout Credit?Powered by Step 3: Convert model (beta)Suggest EditsConvert the model format according to the target device.
|
87 |
+
1. Go to Convert page
|
88 |
+
Click the Convert button on the Models page.
|
89 |
+
2. Convert model
|
90 |
+
Enter the name and memo for the converted model. Select a base model to be converted and the target hardware to benchmark the model.
|
91 |
+
Depending on the framework of the base model, the options available for converting are different.
|
92 |
+
Models built with Model Searcher → TensorRT, TensorFlow Lite, OpenVINO
|
93 |
+
Custom models
|
94 |
+
ONNX → TensorRT, TensorFlow Lite, OpenVINO
|
95 |
+
Click the Start converting button to convert the model. (Converting for the NVIDIA Jetson family (TensorRT) may take up to 1 hour.)
|
96 |
+
3. Check the converting result
|
97 |
+
The converted model will be displayed on the Models page with performance benchmarks on the selected target hardware.
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
######################################################
|
103 |
+
|
104 |
+
Source 2
|
105 |
+
Method: Structured Pruning
|
106 |
+
Model Compression
|
107 |
+
The goal of model compression is to achieve a model that is simplified from the original without performance deterioration. By compressing a large model, the user can reduce storage and computational costs and use the model in real-time applications.
|
108 |
+
NetsPresso supports the following compression methods.
|
109 |
+
Structured Pruning
|
110 |
+
Filter Decomposition
|
111 |
+
This page describes Structured Pruning.
|
112 |
+
What is "Pruning"?
|
113 |
+
Pruning is the process of removing individual parameters or groups of parameters from a complex model to make it faster and more compact. This procedure is divided into unstructured pruning and structured pruning, depending on what is removed.
|
114 |
+
Unstructured Pruning
|
115 |
+
|
116 |
+
: Removes individual parameters and returns a sparse model, which requires additional hardware or software support to be accelerated.
|
117 |
+
|
118 |
+
Structured Pruning
|
119 |
+
|
120 |
+
: Removes entire neurons, filters, or channels and returns a model, which does not require any particular hardware or software to be accelerated.
|
121 |
+
|
122 |
+
The goal of pruning is to reduce computational resources and accelerate the model by removing unnecessary filters. (Model Compressor only supports structured pruning; unstructured pruning will be published in the near future.)
|
123 |
+
However, the fine-tuning process is necessary to compensate for the loss of accuracy.
|
124 |
+
Structured Pruning
|
125 |
+
Supported functions
|
126 |
+
Pruning in Model Compressor provides two pruning functions (Pruning by Channel Index / Criteria) and one recommendation (SLAMP) to fulfill the user's demands for model compression.
|
127 |
+
Pruning by Channel Index
|
128 |
+
|
129 |
+
Removes the filters that the user selects. If the selected filters are redundant or less important, this returns a better-performing model.
|
130 |
+
|
131 |
+
Pruning by Criteria
|
132 |
+
|
133 |
+
L2 Norm
|
134 |
+
|
135 |
+
: L2-Norm is used to represent the importance of the corresponding filter. In other words, this method prunes filters based on the magnitude of weights.
|
136 |
+
|
137 |
+
Nuclear Norm
|
138 |
+
|
139 |
+
: The nuclear norm is the sum of the singular values and represents the energy. This method computes the nuclear norm of the feature maps to determine each filter's relevance, so a portion of the dataset is needed. For more detail, please refer to the following paper.
|
140 |
+
|
141 |
+
Seul-Ki Yeom, Kyung-Hwan Shim, and Jee-Hyun Hwang. Toward compact deep neural networks via energy-aware pruning. arXiv preprint, 2021.
|
142 |
+
|
143 |
+
Geometric Median
|
144 |
+
|
145 |
+
: Geometric Median is used to measure the redundancy of the corresponding filter and remove redundant filters. For more detail, please refer to the following paper.
|
146 |
+
|
147 |
+
Yang He, Ping Liu, Ziwei Wang, Zhilan Hu, and Yi Yang. Filter pruning via geometric median for deep convolutional neural networks acceleration. In CVPR, 2019.
|
148 |
+
|
149 |
+
Normalization
|
150 |
+
|
151 |
+
Because the distribution and magnitude of the weights vary from layer to layer, it is vital to compare those different distributions from the same perspective. For this reason, all of the criterion values are normalized layer by layer (an illustrative sketch follows below).
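For illustration only, and not the exact formula used by Model Compressor: the minimal sketch below scores each filter of a convolution layer by its L2 norm and then normalizes the scores per layer so that layers with very different weight scales can be compared. The min-max normalization and the function names here are assumptions.
Python
import numpy as np

def l2_filter_scores(conv_weight):
    # Per-filter L2 norm for a conv weight of shape (out_ch, in_ch, kH, kW).
    out_ch = conv_weight.shape[0]
    return np.linalg.norm(conv_weight.reshape(out_ch, -1), axis=1)

def normalize_per_layer(scores):
    # Illustrative min-max normalization so scores from different layers are comparable.
    lo, hi = scores.min(), scores.max()
    return (scores - lo) / (hi - lo + 1e-12)

rng = np.random.default_rng(0)
layer1 = rng.normal(scale=0.1, size=(16, 3, 3, 3))    # 16 filters, small weights
layer2 = rng.normal(scale=1.0, size=(32, 16, 3, 3))   # 32 filters, larger weights

for name, w in [("layer1", layer1), ("layer2", layer2)]:
    scores = normalize_per_layer(l2_filter_scores(w))
    n_prune = int(round(0.2 * len(scores)))           # prune the 20% least important filters
    print(name, "filters to prune:", sorted(np.argsort(scores)[:n_prune].tolist()))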
|
152 |
+
|
153 |
+
""Recommendation"" in Model Compressor
|
154 |
+
|
155 |
+
The "Recommendation" option enables so-called global pruning, which allocates the pruning ratio for each layer automatically. The current version only supports SLAMP.
|
156 |
+
|
157 |
+
SLAMP (Structured Layer-adaptive Sparsity for the Magnitude-based Pruning)
|
158 |
+
|
159 |
+
SLAMP is inspired by "Layer-adaptive Sparsity for the Magnitude-based Pruning" (LAMP) from ICLR 2021.
|
160 |
+
|
161 |
+
LAMP is an unstructured pruning method; here it is modified to measure layer-wise importance for structured pruning.
|
162 |
+
|
163 |
+
Normalization function
|
164 |
+
|
165 |
+
A normalization function is applied to the criterion values described above (see the reference formula below).
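The exact normalization used by Model Compressor is not reproduced on this page. For reference, the LAMP paper cited above defines the following layer-adaptive score for a layer's weights W, indexed in ascending order of magnitude; SLAMP presumably applies the same idea to per-filter norms instead of individual weights, which is an assumption here rather than NetsPresso's published formula.
LaTeX
\mathrm{score}(u;\,W) \;=\; \frac{W[u]^{2}}{\sum_{v \ge u} W[v]^{2}}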
|
166 |
+
|
167 |
+
What you can do with Model Compressor
|
168 |
+
Choose either "Pruning by Channel Index" or "Pruning by Criteria" according to your purpose.
|
169 |
+
|
170 |
+
"Pruning by Channel Index" is recommended for experts who already know which filters are unnecessary.
|
171 |
+
"Pruning by Criteria" is recommended for users who want to prune specific layers by a certain ratio, or for beginners in model compression.
|
172 |
+
|
173 |
+
To use "Pruning by Channel Index"
|
174 |
+
Check "Pruning by Channel Index".
|
175 |
+
Check the layers to be pruned.
|
176 |
+
Insert the filter indices to be pruned (e.g. 105, 8, 9, 11-13); see the sketch below for what this does to a convolution layer.
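As referenced in the step above, the following minimal sketch shows what removing the selected filter indices does to a single convolution layer's weight tensor; it ignores the matching update to the next layer's input channels and is an illustration, not Model Compressor's implementation.
Python
import numpy as np

def prune_filters_by_index(conv_weight, indices):
    # Drop the selected output filters from a conv weight of shape (out_ch, in_ch, kH, kW).
    drop = set(indices)
    keep = [i for i in range(conv_weight.shape[0]) if i not in drop]
    return conv_weight[keep]

to_prune = [105, 8, 9, 11, 12, 13]          # "105, 8, 9, 11-13" expanded
weight = np.random.randn(128, 64, 3, 3)     # a layer with 128 filters
pruned = prune_filters_by_index(weight, to_prune)
print(weight.shape, "->", pruned.shape)     # (128, 64, 3, 3) -> (122, 64, 3, 3)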
|
177 |
+
|
178 |
+
To use "Pruning by Criteria"
|
179 |
+
|
180 |
+
Check one of the criteria (ex. L2-Norm) to calculate the importance score of each filter.
|
181 |
+
Prune specific layers by a given ratio.
|
182 |
+
|
183 |
+
Check the layers to be pruned.
|
184 |
+
Insert the ratio to be pruned (e.g. 0.2).
|
185 |
+
|
186 |
+
Compress all layers automatically by a given overall ratio.
|
187 |
+
|
188 |
+
Press "Recommendation" and insert the ratio (e.g. 0.8) to compress the model.
|
189 |
+
|
190 |
+
Verification of the Structured Pruning
|
191 |
+
Pruning in Model Compressor removes specific filters while preserving the model's previous knowledge. To verify the pruning method in Model Compressor, we assess the categorical accuracy of the returned compressed model and of its fine-tuned model.
|
192 |
+
|
193 |
+
CIFAR-100 Accuracy of the Compressed Model and Fine-tuned Model
|
194 |
+
The compressed models are produced with the following strategies: L2 Norm, SLAMP, and intersection. (Original models are from here.)
|
195 |
+
The dashed and solid lines in the above image indicate the compressed model from Model Compressor and the model after fine-tuning, respectively, for the given pruning ratio.
|
196 |
+
🚧Not Supported Layers
|
197 |
+
Grouped convolution layers are currently not supported; support will be added in the near future.
|
198 |
+
|
199 |
+
|
200 |
+
|
201 |
+
|
202 |
+
|
203 |
+
|
204 |
+
|
205 |
+
",,,2023-06-28 07:12:14.309344
|
206 |
+
What is NetsPresoo?,"NetsPresso is a machine learning pipeline that provides various tools and functionalities for model development and optimization. It includes components such as Model Searcher, Model Compressor, and Model Launcher. NetsPresso aims to simplify the process of developing and deploying machine learning models by offering features like dataset preparation, model compression, and model deployment. It supports tasks like image classification, object detection, semantic segmentation, and super resolution. NetsPresso also provides benchmark results and best practices for different models and datasets.","Source 0
|
207 |
+
Model Compressor
|
208 |
+
Using NetsPresso Model Compressor on a variety of well-known public datasets, this article shows users how to get outstanding compression results.
|
209 |
+
A fine-tuning procedure is necessary after each compression. It usually follows the original model's training configuration, except for the learning rate: after a few batches of training, the learning rate is adjusted depending on whether the loss has converged (a rough sketch of this idea follows below).
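A minimal sketch of that learning-rate adjustment, under assumed conventions rather than NetsPresso's exact procedure: compare the average loss over the most recent batches with the preceding window and decay the learning rate once the loss stops improving. The window size, tolerance, and decay factor are illustrative assumptions.
Python
def adjust_lr(losses, lr, window=5, tol=1e-3, decay=0.1):
    # Decay the learning rate once the recent loss window stops improving.
    if len(losses) < 2 * window:
        return lr                                   # not enough batches observed yet
    recent = sum(losses[-window:]) / window
    previous = sum(losses[-2 * window:-window]) / window
    return lr * decay if (previous - recent) < tol else lr

history = [0.501, 0.500, 0.500, 0.500, 0.500, 0.500, 0.500, 0.500, 0.500, 0.500]
print(adjust_lr(history, lr=1e-2))                  # decays to ~0.001 because the loss has plateaued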
|
210 |
+
All of the original and compressed models can be downloaded easily on the Model Compressor Model Zoo.
|
211 |
+
See Image Classification Results
|
212 |
+
See Object Detection Results
|
213 |
+
See Semantic Segmentation Results
|
214 |
+
See Super Resolution Results
|
215 |
+
|
216 |
+
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
######################################################
|
221 |
+
|
222 |
+
Source 1
|
223 |
+
Connect Personal Server
|
224 |
+
To connect a personal server, start by clicking a 'New Server' button on the screen below.
|
225 |
+
Location: My Account > Resources
|
226 |
+
The way to connect a personal server to NetsPresso is to install an agent on the personal server.
|
227 |
+
The process is as follows.
|
228 |
+
Step 1. Create Server
|
229 |
+
Specify an identifiable name and the path where the agent will be installed.
|
230 |
+
Step 2. Set Server
|
231 |
+
Copy the script that pre-checks the server environment and receives server information.
|
232 |
+
Save the copied script as an .sh file and run it in the shell.
|
233 |
+
After executing the script, you will see the server information in JSON format as shown below.
|
234 |
+
Copy the server information in JSON format and paste it into the result input field.
|
235 |
+
Step 3. Connect Server
|
236 |
+
As shown below, check the server information and copy the connect script used to connect the server.
|
237 |
+
Save the copied script as an .sh file and run it in the shell.
|
238 |
+
As a result of execution, the server is connected as shown below.
|
239 |
+
You can check the server you have connected to on the Resources page.
|
240 |
+
|
241 |
+
|
242 |
+
|
243 |
+
|
244 |
+
|
245 |
+
######################################################
|
246 |
+
|
247 |
+
Source 2
|
248 |
+
- Object Detection
|
249 |
+
All of the original and compressed models can be downloaded easily on the Model Compressor Model Zoo.
|
250 |
+
You can get Compressed results with Automatic Compression and Compressed (Adv.) results with Advanced Compression.
|
251 |
+
PyTorch
|
252 |
+
Model | Best Practice | Type | Dataset | mAP(0.5) (%) | mAP(0.5:0.95) (%) | FLOPs (M) | Params (M) | Latency (ms) | Model Size (MB)
YOLOX | - | Original | COCO | 68.0 | 49.7 | 156006.20 | 54.21 | 12239.46 | 207.37
YOLOX | Google Colab | Compressed-1 | COCO | 67.16 (-0.84) | 48.64 (-1.06) | 101804.06 (1.53x) | 19.96 (2.7x) | 8502.72 (1.44x) | 76.61 (2.7x)
YOLOX | Google Colab | Compressed-2 | COCO | 61.43 (-6.57) | 43.23 (-5.47) | 38607.03 (4.04x) | 4.93 (11.0x) | 4235.37 (2.89x) | 19.17 (10.80x)
|
253 |
+
The model’s latency is measured on Raspberry Pi 4B (1.5GHz ARM Cortex).
|
254 |
+
Options: FP32, ONNX runtime
|
255 |
+
TensorFlow-Keras
|
256 |
+
Model | Best Practice | Type | Dataset | mAP(0.5) (%) | mAP(0.5:0.95) (%) | FLOPs (M) | Params (M) | Latency (ms) | Model Size (MB)
YOLOv4 | - | Original | PASCAL VOC | 82.22 | - | 61871.82 | 65.32 | 64318.70 | 262.90
YOLOv4 | Google Colab | Compressed-1 | PASCAL VOC | 87.23 (+5.01) | - | 11459.69 (5.4x) | 10.59 (6.17x) | 28651.70 (2.16x) | 44.12 (5.96x)
YOLOv4 | Google Colab | Compressed-2 | PASCAL VOC | 87.91 (+5.69) | - | 14442.96 (4.28x) | 10.71 (6.1x) | 28976.40 (2.14x) | 44.36 (5.93x)
|
257 |
+
YOLOv4 model with EfficientNet B1 based backbone.
|
258 |
+
The model’s latency is measured on Raspberry Pi 4B (1.5GHz ARM Cortex).
|
259 |
+
Options: FP32, TFLite
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
######################################################
|
265 |
+
|
266 |
+
Source 3
|
267 |
+
- Object Detection
|
268 |
+
Quick Search supports training a model based on open sources, and NPNets will be available with Advanced Search.
|
269 |
+
YOLOv5 and YOLOv5-NPNets
|
270 |
+
Dataset | Model | mAP(0.5) (%) | mAP(0.5:0.95) (%) | GFLOPs | Parameters (M) | Model Size (MB)
PASCAL VOC | YOLOv5n | 72.2 | 44 | 4.3 | 1.79 | 3.62
PASCAL VOC | YOLOv5n-NPNet | 73.4 | 46.3 | 3.5 | 1.18 | 2.49
PASCAL VOC | YOLOv5s | 77.9 | 52.2 | 16 | 7.07 | 13.7
PASCAL VOC | YOLOv5s-NPNet | 80.2 | 56 | 12.8 | 4.61 | 9.05
PASCAL VOC | YOLOv5m | 82.1 | 59.3 | 48.3 | 20.95 | 40.2
PASCAL VOC | YOLOv5m-NPNet | 83.4 | 60.6 | 37 | 12.26 | 23.7
PASCAL VOC | YOLOv5l | 82.9 | 61 | 108.3 | 46.24 | 88.6
PASCAL VOC | YOLOv5l-NPNet | 85.1 | 63.8 | 81 | 25.19 | 48.5
|
271 |
+
|
272 |
+
",,,2023-08-31 13:29:07.271798
|
integrations/clprf/test_clprf.py
ADDED
@@ -0,0 +1,951 @@
|
|
|
1 |
+
#
|
2 |
+
# Pyserini: Reproducible IR research with sparse and dense representations
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
#
|
16 |
+
|
17 |
+
import os
|
18 |
+
import shutil
|
19 |
+
import unittest
|
20 |
+
from random import randint
|
21 |
+
|
22 |
+
from integrations.lucenesearcher_score_checker import LuceneSearcherScoreChecker
|
23 |
+
from integrations.utils import run_command, parse_score
|
24 |
+
|
25 |
+
|
26 |
+
class TestSearchIntegration(unittest.TestCase):
|
27 |
+
def setUp(self):
|
28 |
+
# The current directory depends on if you're running inside an IDE or from command line.
|
29 |
+
curdir = os.getcwd()
|
30 |
+
if curdir.endswith('clprf'):
|
31 |
+
self.pyserini_root = '../..'
|
32 |
+
self.anserini_root = '../../../anserini'
|
33 |
+
else:
|
34 |
+
self.pyserini_root = '.'
|
35 |
+
self.anserini_root = '../anserini'
|
36 |
+
|
37 |
+
self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
|
38 |
+
|
39 |
+
if os.path.exists(self.tmp):
|
40 |
+
shutil.rmtree(self.tmp)
os.mkdir(self.tmp)
|
41 |
+
else:
|
42 |
+
os.mkdir(self.tmp)
|
43 |
+
|
44 |
+
self.pyserini_search_cmd = 'python -m pyserini.search.lucene'
|
45 |
+
self.pyserini_fusion_cmd = 'python -m pyserini.fusion'
|
46 |
+
self.pyserini_eval_cmd = 'python -m pyserini.eval.trec_eval'
|
47 |
+
|
48 |
+
self.core17_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.nyt')
|
49 |
+
self.core17_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt')
|
50 |
+
|
51 |
+
self.core18_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.wapo.v2')
|
52 |
+
self.core18_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt')
|
53 |
+
|
54 |
+
self.robust04_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.disk45')
|
55 |
+
self.robust04_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt')
|
56 |
+
|
57 |
+
self.robust05_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.robust05')
|
58 |
+
self.robust05_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt')
|
59 |
+
|
60 |
+
self.core17_checker = LuceneSearcherScoreChecker(
|
61 |
+
index=self.core17_index_path,
|
62 |
+
topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'),
|
63 |
+
pyserini_topics='core17',
|
64 |
+
qrels=self.core17_qrels_path,
|
65 |
+
eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
|
66 |
+
|
67 |
+
self.core18_checker = LuceneSearcherScoreChecker(
|
68 |
+
index=self.core18_index_path,
|
69 |
+
topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'),
|
70 |
+
pyserini_topics='core18',
|
71 |
+
qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'),
|
72 |
+
eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
|
73 |
+
|
74 |
+
self.robust04_checker = LuceneSearcherScoreChecker(
|
75 |
+
index=self.robust04_index_path,
|
76 |
+
topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'),
|
77 |
+
pyserini_topics='robust04',
|
78 |
+
qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'),
|
79 |
+
eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
|
80 |
+
|
81 |
+
self.robust05_checker = LuceneSearcherScoreChecker(
|
82 |
+
index=self.robust05_index_path,
|
83 |
+
topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'),
|
84 |
+
pyserini_topics='robust05',
|
85 |
+
qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'),
|
86 |
+
eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
|
87 |
+
|
88 |
+
def test_cross_validation(self):
|
89 |
+
pyserini_topics = 'core17'
|
90 |
+
os.mkdir(f'{self.tmp}/core17')
|
91 |
+
for alpha in [x / 10.0 for x in range(0, 11)]:
|
92 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
93 |
+
--topics {pyserini_topics} --output {self.tmp}/core17/core17_lr_A{alpha}_bm25.txt \
|
94 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha {alpha}'
|
95 |
+
|
96 |
+
status = os.system(run_file_cmd)
|
97 |
+
self.assertEqual(status, 0)
|
98 |
+
os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \
|
99 |
+
--anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \
|
100 |
+
--collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ')
|
101 |
+
|
102 |
+
cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
103 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
104 |
+
{self.tmp}/core17_lr.txt'
|
105 |
+
|
106 |
+
status = os.system(cmd)
|
107 |
+
stdout, stderr = run_command(cmd)
|
108 |
+
score = parse_score(stdout, 'map')
|
109 |
+
|
110 |
+
self.assertEqual(status, 0)
|
111 |
+
self.assertEqual(stderr, '')
|
112 |
+
self.assertAlmostEqual(score, 0.2462, delta=0.0001)
|
113 |
+
|
114 |
+
def test_core17(self):
|
115 |
+
self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25', 0.2087))
|
116 |
+
|
117 |
+
def test_core17_rm3(self):
|
118 |
+
self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25 --rm3', 0.2798))
|
119 |
+
|
120 |
+
def test_core17_lr(self):
|
121 |
+
pyserini_topics = 'core17'
|
122 |
+
|
123 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
124 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_lr.txt \
|
125 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
|
126 |
+
|
127 |
+
status = os.system(run_file_cmd)
|
128 |
+
self.assertEqual(status, 0)
|
129 |
+
|
130 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
131 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
132 |
+
{self.tmp}/core17_lr.txt'
|
133 |
+
|
134 |
+
status = os.system(score_cmd)
|
135 |
+
stdout, stderr = run_command(score_cmd)
|
136 |
+
score = parse_score(stdout, 'map')
|
137 |
+
|
138 |
+
self.assertEqual(status, 0)
|
139 |
+
self.assertEqual(stderr, '')
|
140 |
+
self.assertAlmostEqual(score, 0.2473, delta=0.0001)
|
141 |
+
|
142 |
+
def test_core17_lr_rm3(self):
|
143 |
+
pyserini_topics = 'core17'
|
144 |
+
|
145 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
146 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt \
|
147 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
|
148 |
+
|
149 |
+
status = os.system(run_file_cmd)
|
150 |
+
self.assertEqual(status, 0)
|
151 |
+
|
152 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
153 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
154 |
+
{self.tmp}/core17_lr_rm3.txt'
|
155 |
+
|
156 |
+
status = os.system(score_cmd)
|
157 |
+
stdout, stderr = run_command(score_cmd)
|
158 |
+
score = parse_score(stdout, 'map')
|
159 |
+
|
160 |
+
self.assertEqual(status, 0)
|
161 |
+
self.assertEqual(stderr, '')
|
162 |
+
self.assertAlmostEqual(score, 0.2926, delta=0.0001)
|
163 |
+
|
164 |
+
def test_core17_svm(self):
|
165 |
+
pyserini_topics = 'core17'
|
166 |
+
|
167 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
168 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_svm.txt \
|
169 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
|
170 |
+
|
171 |
+
status = os.system(run_file_cmd)
|
172 |
+
self.assertEqual(status, 0)
|
173 |
+
|
174 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
175 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
176 |
+
{self.tmp}/core17_svm.txt'
|
177 |
+
|
178 |
+
status = os.system(score_cmd)
|
179 |
+
stdout, stderr = run_command(score_cmd)
|
180 |
+
score = parse_score(stdout, 'map')
|
181 |
+
|
182 |
+
self.assertEqual(status, 0)
|
183 |
+
self.assertEqual(stderr, '')
|
184 |
+
self.assertAlmostEqual(score, 0.2385, delta=0.0001)
|
185 |
+
|
186 |
+
def test_core17_svm_rm3(self):
|
187 |
+
pyserini_topics = 'core17'
|
188 |
+
|
189 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
190 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt \
|
191 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
|
192 |
+
|
193 |
+
status = os.system(run_file_cmd)
|
194 |
+
self.assertEqual(status, 0)
|
195 |
+
|
196 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
197 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
198 |
+
{self.tmp}/core17_svm_rm3.txt'
|
199 |
+
|
200 |
+
status = os.system(score_cmd)
|
201 |
+
stdout, stderr = run_command(score_cmd)
|
202 |
+
score = parse_score(stdout, 'map')
|
203 |
+
|
204 |
+
self.assertEqual(status, 0)
|
205 |
+
self.assertEqual(stderr, '')
|
206 |
+
self.assertAlmostEqual(score, 0.2956, delta=0.0001)
|
207 |
+
|
208 |
+
def test_core17_avg(self):
|
209 |
+
pyserini_topics = 'core17'
|
210 |
+
|
211 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
212 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_avg.txt \
|
213 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
|
214 |
+
|
215 |
+
status = os.system(run_file_cmd)
|
216 |
+
self.assertEqual(status, 0)
|
217 |
+
|
218 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
219 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
220 |
+
{self.tmp}/core17_avg.txt'
|
221 |
+
|
222 |
+
status = os.system(score_cmd)
|
223 |
+
stdout, stderr = run_command(score_cmd)
|
224 |
+
score = parse_score(stdout, 'map')
|
225 |
+
|
226 |
+
self.assertEqual(status, 0)
|
227 |
+
self.assertEqual(stderr, '')
|
228 |
+
self.assertAlmostEqual(score, 0.2442, delta=0.0001)
|
229 |
+
|
230 |
+
def test_core17_avg_rm3(self):
|
231 |
+
pyserini_topics = 'core17'
|
232 |
+
|
233 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
234 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_avg_rm3.txt \
|
235 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
|
236 |
+
|
237 |
+
status = os.system(run_file_cmd)
|
238 |
+
self.assertEqual(status, 0)
|
239 |
+
|
240 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
241 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
242 |
+
{self.tmp}/core17_avg_rm3.txt'
|
243 |
+
|
244 |
+
status = os.system(score_cmd)
|
245 |
+
stdout, stderr = run_command(score_cmd)
|
246 |
+
score = parse_score(stdout, 'map')
|
247 |
+
|
248 |
+
self.assertEqual(status, 0)
|
249 |
+
self.assertEqual(stderr, '')
|
250 |
+
self.assertAlmostEqual(score, 0.2950, delta=0.0001)
|
251 |
+
|
252 |
+
def test_core17_rrf(self):
|
253 |
+
pyserini_topics = 'core17'
|
254 |
+
lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
255 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_lr.txt \
|
256 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
|
257 |
+
|
258 |
+
status = os.system(lr_cmd)
|
259 |
+
self.assertEqual(status, 0)
|
260 |
+
|
261 |
+
svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
262 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_svm.txt \
|
263 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
|
264 |
+
|
265 |
+
status = os.system(svm_cmd)
|
266 |
+
self.assertEqual(status, 0)
|
267 |
+
|
268 |
+
rrf_cmd = f'{self.pyserini_fusion_cmd} \
|
269 |
+
--runs {self.tmp}/core17_lr.txt {self.tmp}/core17_svm.txt \
|
270 |
+
--output {self.tmp}/core17_rrf.txt --resort'
|
271 |
+
|
272 |
+
status = os.system(rrf_cmd)
|
273 |
+
self.assertEqual(status, 0)
|
274 |
+
|
275 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
276 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
277 |
+
{self.tmp}/core17_rrf.txt'
|
278 |
+
|
279 |
+
status = os.system(score_cmd)
|
280 |
+
stdout, stderr = run_command(score_cmd)
|
281 |
+
score = parse_score(stdout, 'map')
|
282 |
+
|
283 |
+
self.assertEqual(status, 0)
|
284 |
+
self.assertEqual(stderr, '')
|
285 |
+
self.assertAlmostEqual(score, 0.2446, delta=0.0001)
|
286 |
+
|
287 |
+
def test_core17_rrf_rm3(self):
|
288 |
+
pyserini_topics = 'core17'
|
289 |
+
lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
290 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt \
|
291 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
|
292 |
+
|
293 |
+
status = os.system(lr_cmd)
|
294 |
+
self.assertEqual(status, 0)
|
295 |
+
|
296 |
+
svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
|
297 |
+
--topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt \
|
298 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
|
299 |
+
|
300 |
+
status = os.system(svm_cmd)
|
301 |
+
self.assertEqual(status, 0)
|
302 |
+
|
303 |
+
rrf_cmd = f'{self.pyserini_fusion_cmd} \
|
304 |
+
--runs {self.tmp}/core17_lr_rm3.txt {self.tmp}/core17_svm_rm3.txt \
|
305 |
+
--output {self.tmp}/core17_rrf_rm3.txt --resort'
|
306 |
+
|
307 |
+
status = os.system(rrf_cmd)
|
308 |
+
self.assertEqual(status, 0)
|
309 |
+
|
310 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
311 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
|
312 |
+
{self.tmp}/core17_rrf_rm3.txt'
|
313 |
+
|
314 |
+
status = os.system(score_cmd)
|
315 |
+
stdout, stderr = run_command(score_cmd)
|
316 |
+
score = parse_score(stdout, 'map')
|
317 |
+
|
318 |
+
self.assertEqual(status, 0)
|
319 |
+
self.assertEqual(stderr, '')
|
320 |
+
self.assertAlmostEqual(score, 0.2957, delta=0.0001)
|
321 |
+
|
322 |
+
def test_core18(self):
|
323 |
+
self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25', 0.2496))
|
324 |
+
|
325 |
+
def test_core18_rm3(self):
|
326 |
+
self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25 --rm3', 0.3129))
|
327 |
+
|
328 |
+
def test_core18_lr(self):
|
329 |
+
pyserini_topics = 'core18'
|
330 |
+
|
331 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
332 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_lr.txt \
|
333 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
|
334 |
+
|
335 |
+
status = os.system(run_file_cmd)
|
336 |
+
self.assertEqual(status, 0)
|
337 |
+
|
338 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
339 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
340 |
+
{self.tmp}/core18_lr.txt'
|
341 |
+
|
342 |
+
status = os.system(score_cmd)
|
343 |
+
stdout, stderr = run_command(score_cmd)
|
344 |
+
score = parse_score(stdout, 'map')
|
345 |
+
|
346 |
+
self.assertEqual(status, 0)
|
347 |
+
self.assertEqual(stderr, '')
|
348 |
+
self.assertAlmostEqual(score, 0.2839, delta=0.0001)
|
349 |
+
|
350 |
+
def test_core18_lr_rm3(self):
|
351 |
+
pyserini_topics = 'core18'
|
352 |
+
|
353 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
354 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt \
|
355 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
|
356 |
+
|
357 |
+
status = os.system(run_file_cmd)
|
358 |
+
self.assertEqual(status, 0)
|
359 |
+
|
360 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
361 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
362 |
+
{self.tmp}/core18_lr_rm3.txt'
|
363 |
+
|
364 |
+
status = os.system(score_cmd)
|
365 |
+
stdout, stderr = run_command(score_cmd)
|
366 |
+
score = parse_score(stdout, 'map')
|
367 |
+
|
368 |
+
self.assertEqual(status, 0)
|
369 |
+
self.assertEqual(stderr, '')
|
370 |
+
self.assertAlmostEqual(score, 0.3216, delta=0.0001)
|
371 |
+
|
372 |
+
def test_core18_svm(self):
|
373 |
+
pyserini_topics = 'core18'
|
374 |
+
|
375 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
376 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_svm.txt \
|
377 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
|
378 |
+
|
379 |
+
status = os.system(run_file_cmd)
|
380 |
+
self.assertEqual(status, 0)
|
381 |
+
|
382 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
383 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
384 |
+
{self.tmp}/core18_svm.txt'
|
385 |
+
|
386 |
+
status = os.system(score_cmd)
|
387 |
+
stdout, stderr = run_command(score_cmd)
|
388 |
+
score = parse_score(stdout, 'map')
|
389 |
+
|
390 |
+
self.assertEqual(status, 0)
|
391 |
+
self.assertEqual(stderr, '')
|
392 |
+
self.assertAlmostEqual(score, 0.2841, delta=0.0001)
|
393 |
+
|
394 |
+
def test_core18_svm_rm3(self):
|
395 |
+
pyserini_topics = 'core18'
|
396 |
+
|
397 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
398 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt \
|
399 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
|
400 |
+
|
401 |
+
status = os.system(run_file_cmd)
|
402 |
+
self.assertEqual(status, 0)
|
403 |
+
|
404 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
405 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
406 |
+
{self.tmp}/core18_svm_rm3.txt'
|
407 |
+
|
408 |
+
status = os.system(score_cmd)
|
409 |
+
stdout, stderr = run_command(score_cmd)
|
410 |
+
score = parse_score(stdout, 'map')
|
411 |
+
|
412 |
+
self.assertEqual(status, 0)
|
413 |
+
self.assertEqual(stderr, '')
|
414 |
+
self.assertAlmostEqual(score, 0.3200, delta=0.0001)
|
415 |
+
|
416 |
+
def test_core18_avg(self):
|
417 |
+
pyserini_topics = 'core18'
|
418 |
+
|
419 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
420 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_avg.txt \
|
421 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4'
|
422 |
+
|
423 |
+
status = os.system(run_file_cmd)
|
424 |
+
self.assertEqual(status, 0)
|
425 |
+
|
426 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
427 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
428 |
+
{self.tmp}/core18_avg.txt'
|
429 |
+
|
430 |
+
status = os.system(score_cmd)
|
431 |
+
stdout, stderr = run_command(score_cmd)
|
432 |
+
score = parse_score(stdout, 'map')
|
433 |
+
|
434 |
+
self.assertEqual(status, 0)
|
435 |
+
self.assertEqual(stderr, '')
|
436 |
+
self.assertAlmostEqual(score, 0.2860, delta=0.0001)
|
437 |
+
|
438 |
+
def test_core18_avg_rm3(self):
|
439 |
+
pyserini_topics = 'core18'
|
440 |
+
|
441 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
442 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_avg_rm3.txt \
|
443 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
|
444 |
+
|
445 |
+
status = os.system(run_file_cmd)
|
446 |
+
self.assertEqual(status, 0)
|
447 |
+
|
448 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
449 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
450 |
+
{self.tmp}/core18_avg_rm3.txt'
|
451 |
+
|
452 |
+
status = os.system(score_cmd)
|
453 |
+
stdout, stderr = run_command(score_cmd)
|
454 |
+
score = parse_score(stdout, 'map')
|
455 |
+
|
456 |
+
self.assertEqual(status, 0)
|
457 |
+
self.assertEqual(stderr, '')
|
458 |
+
self.assertAlmostEqual(score, 0.3215, delta=0.0001)
|
459 |
+
|
460 |
+
def test_core18_rrf(self):
|
461 |
+
pyserini_topics = 'core18'
|
462 |
+
lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
463 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_lr.txt \
|
464 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
|
465 |
+
|
466 |
+
status = os.system(lr_cmd)
|
467 |
+
self.assertEqual(status, 0)
|
468 |
+
|
469 |
+
svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
470 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_svm.txt \
|
471 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
|
472 |
+
|
473 |
+
status = os.system(svm_cmd)
|
474 |
+
self.assertEqual(status, 0)
|
475 |
+
|
476 |
+
rrf_cmd = f'{self.pyserini_fusion_cmd} \
|
477 |
+
--runs {self.tmp}/core18_lr.txt {self.tmp}/core18_svm.txt \
|
478 |
+
--output {self.tmp}/core18_rrf.txt --resort'
|
479 |
+
|
480 |
+
status = os.system(rrf_cmd)
|
481 |
+
self.assertEqual(status, 0)
|
482 |
+
|
483 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
484 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
485 |
+
{self.tmp}/core18_rrf.txt'
|
486 |
+
|
487 |
+
status = os.system(score_cmd)
|
488 |
+
stdout, stderr = run_command(score_cmd)
|
489 |
+
score = parse_score(stdout, 'map')
|
490 |
+
|
491 |
+
self.assertEqual(status, 0)
|
492 |
+
self.assertEqual(stderr, '')
|
493 |
+
self.assertAlmostEqual(score, 0.2881, delta=0.0001)
|
494 |
+
|
495 |
+
def test_core18_rrf_rm3(self):
|
496 |
+
pyserini_topics = 'core18'
|
497 |
+
lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
498 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt \
|
499 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
|
500 |
+
|
501 |
+
status = os.system(lr_cmd)
|
502 |
+
self.assertEqual(status, 0)
|
503 |
+
|
504 |
+
svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
|
505 |
+
--topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt \
|
506 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
|
507 |
+
|
508 |
+
status = os.system(svm_cmd)
|
509 |
+
self.assertEqual(status, 0)
|
510 |
+
|
511 |
+
rrf_cmd = f'{self.pyserini_fusion_cmd} \
|
512 |
+
--runs {self.tmp}/core18_lr_rm3.txt {self.tmp}/core18_svm_rm3.txt \
|
513 |
+
--output {self.tmp}/core18_rrf_rm3.txt --resort'
|
514 |
+
|
515 |
+
status = os.system(rrf_cmd)
|
516 |
+
self.assertEqual(status, 0)
|
517 |
+
|
518 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
519 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
|
520 |
+
{self.tmp}/core18_rrf_rm3.txt'
|
521 |
+
|
522 |
+
status = os.system(score_cmd)
|
523 |
+
stdout, stderr = run_command(score_cmd)
|
524 |
+
score = parse_score(stdout, 'map')
|
525 |
+
|
526 |
+
self.assertEqual(status, 0)
|
527 |
+
self.assertEqual(stderr, '')
|
528 |
+
self.assertAlmostEqual(score, 0.3205, delta=0.0001)
|
529 |
+
|
530 |
+
def test_robust04(self):
|
531 |
+
self.assertTrue(self.robust04_checker.run('robust04_bm25', '--bm25', 0.2531))
|
532 |
+
|
533 |
+
def test_robust04_rm3(self):
|
534 |
+
self.assertTrue(self.robust04_checker.run('robust04_bm25_rm3', '--bm25 --rm3', 0.2908))
|
535 |
+
|
536 |
+
def test_robust04_lr(self):
|
537 |
+
pyserini_topics = 'robust04'
|
538 |
+
|
539 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
540 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt \
|
541 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
|
542 |
+
|
543 |
+
status = os.system(run_file_cmd)
|
544 |
+
self.assertEqual(status, 0)
|
545 |
+
|
546 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
547 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
548 |
+
{self.tmp}/robust04_lr.txt'
|
549 |
+
|
550 |
+
status = os.system(score_cmd)
|
551 |
+
stdout, stderr = run_command(score_cmd)
|
552 |
+
score = parse_score(stdout, 'map')
|
553 |
+
|
554 |
+
self.assertEqual(status, 0)
|
555 |
+
self.assertEqual(stderr, '')
|
556 |
+
self.assertAlmostEqual(score, 0.2747, delta=0.0001)
|
557 |
+
|
558 |
+
def test_robust04_lr_rm3(self):
|
559 |
+
pyserini_topics = 'robust04'
|
560 |
+
|
561 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
562 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt \
|
563 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
|
564 |
+
|
565 |
+
status = os.system(run_file_cmd)
|
566 |
+
self.assertEqual(status, 0)
|
567 |
+
|
568 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
569 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
570 |
+
{self.tmp}/robust04_lr_rm3.txt'
|
571 |
+
|
572 |
+
status = os.system(score_cmd)
|
573 |
+
stdout, stderr = run_command(score_cmd)
|
574 |
+
score = parse_score(stdout, 'map')
|
575 |
+
|
576 |
+
self.assertEqual(status, 0)
|
577 |
+
self.assertEqual(stderr, '')
|
578 |
+
self.assertAlmostEqual(score, 0.2969, delta=0.0001)
|
579 |
+
|
580 |
+
def test_robust04_svm(self):
|
581 |
+
pyserini_topics = 'robust04'
|
582 |
+
|
583 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
584 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt \
|
585 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
|
586 |
+
|
587 |
+
status = os.system(run_file_cmd)
|
588 |
+
self.assertEqual(status, 0)
|
589 |
+
|
590 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
591 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
592 |
+
{self.tmp}/robust04_svm.txt'
|
593 |
+
|
594 |
+
status = os.system(score_cmd)
|
595 |
+
stdout, stderr = run_command(score_cmd)
|
596 |
+
score = parse_score(stdout, 'map')
|
597 |
+
|
598 |
+
self.assertEqual(status, 0)
|
599 |
+
self.assertEqual(stderr, '')
|
600 |
+
self.assertAlmostEqual(score, 0.2726, delta=0.0001)
|
601 |
+
|
602 |
+
def test_robust04_svm_rm3(self):
|
603 |
+
pyserini_topics = 'robust04'
|
604 |
+
|
605 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
606 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt \
|
607 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
|
608 |
+
|
609 |
+
status = os.system(run_file_cmd)
|
610 |
+
self.assertEqual(status, 0)
|
611 |
+
|
612 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
613 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
614 |
+
{self.tmp}/robust04_svm_rm3.txt'
|
615 |
+
|
616 |
+
status = os.system(score_cmd)
|
617 |
+
stdout, stderr = run_command(score_cmd)
|
618 |
+
score = parse_score(stdout, 'map')
|
619 |
+
|
620 |
+
self.assertEqual(status, 0)
|
621 |
+
self.assertEqual(stderr, '')
|
622 |
+
self.assertAlmostEqual(score, 0.2972, delta=0.0001)
|
623 |
+
|
624 |
+
def test_robust04_avg(self):
|
625 |
+
pyserini_topics = 'robust04'
|
626 |
+
|
627 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
628 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_avg.txt \
|
629 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
|
630 |
+
|
631 |
+
status = os.system(run_file_cmd)
|
632 |
+
self.assertEqual(status, 0)
|
633 |
+
|
634 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
635 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
636 |
+
{self.tmp}/robust04_avg.txt'
|
637 |
+
|
638 |
+
status = os.system(score_cmd)
|
639 |
+
stdout, stderr = run_command(score_cmd)
|
640 |
+
score = parse_score(stdout, 'map')
|
641 |
+
|
642 |
+
self.assertEqual(status, 0)
|
643 |
+
self.assertEqual(stderr, '')
|
644 |
+
self.assertAlmostEqual(score, 0.276, delta=0.0001)
|
645 |
+
|
646 |
+
def test_robust04_avg_rm3(self):
|
647 |
+
pyserini_topics = 'robust04'
|
648 |
+
|
649 |
+
run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
650 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_avg_rm3.txt \
|
651 |
+
--prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
|
652 |
+
|
653 |
+
status = os.system(run_file_cmd)
|
654 |
+
self.assertEqual(status, 0)
|
655 |
+
|
656 |
+
score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
|
657 |
+
{self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
|
658 |
+
{self.tmp}/robust04_avg_rm3.txt'
|
659 |
+
|
660 |
+
status = os.system(score_cmd)
|
661 |
+
stdout, stderr = run_command(score_cmd)
|
662 |
+
score = parse_score(stdout, 'map')
|
663 |
+
|
664 |
+
self.assertEqual(status, 0)
|
665 |
+
self.assertEqual(stderr, '')
|
666 |
+
self.assertAlmostEqual(score, 0.2980, delta=0.0001)
|
667 |
+
|
668 |
+
def test_robust04_rrf(self):
|
669 |
+
pyserini_topics = 'robust04'
|
670 |
+
lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
671 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt \
|
672 |
+
--prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
|
673 |
+
|
674 |
+
status = os.system(lr_cmd)
|
675 |
+
self.assertEqual(status, 0)
|
676 |
+
|
677 |
+
svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
|
678 |
+
--topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt \
|
679 |
+
--prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
|
680 |
+
|
681 |
+
status = os.system(svm_cmd)
|
682 |
+
        self.assertEqual(status, 0)

        rrf_cmd = f'{self.pyserini_fusion_cmd} \
            --runs {self.tmp}/robust04_lr.txt {self.tmp}/robust04_svm.txt \
            --output {self.tmp}/robust04_rrf.txt --resort'

        status = os.system(rrf_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
            {self.tmp}/robust04_rrf.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.275, delta=0.0001)

    def test_robust04_rrf_rm3(self):
        pyserini_topics = 'robust04'
        lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt \
            --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'

        status = os.system(lr_cmd)
        self.assertEqual(status, 0)

        svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt \
            --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'

        status = os.system(svm_cmd)
        self.assertEqual(status, 0)

        rrf_cmd = f'{self.pyserini_fusion_cmd} \
            --runs {self.tmp}/robust04_lr_rm3.txt {self.tmp}/robust04_svm_rm3.txt \
            --output {self.tmp}/robust04_rrf_rm3.txt --resort'

        status = os.system(rrf_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
            {self.tmp}/robust04_rrf_rm3.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2977, delta=0.0001)

    def test_robust05(self):
        self.assertTrue(self.robust05_checker.run('robust05_bm25', '--bm25', 0.2032))

    def test_robust05_rm3(self):
        self.assertTrue(self.robust05_checker.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2624))

    def test_robust05_lr(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt \
            --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_lr.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2476, delta=0.0001)

    def test_robust05_lr_rm3(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt \
            --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_lr_rm3.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2872, delta=0.0001)

    def test_robust05_svm(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt \
            --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_svm.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2486, delta=0.0001)

    def test_robust05_svm_rm3(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt \
            --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_svm_rm3.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2871, delta=0.0001)

    def test_robust05_avg(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_avg.txt \
            --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_avg.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2485, delta=0.0001)

    def test_robust05_avg_rm3(self):
        pyserini_topics = 'robust05'

        run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_avg_rm3.txt \
            --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'

        status = os.system(run_file_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_avg_rm3.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2880, delta=0.0001)

    def test_robust05_rrf(self):
        pyserini_topics = 'robust05'
        lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt \
            --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'

        status = os.system(lr_cmd)
        self.assertEqual(status, 0)

        svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt \
            --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'

        status = os.system(svm_cmd)
        self.assertEqual(status, 0)

        rrf_cmd = f'{self.pyserini_fusion_cmd} \
            --runs {self.tmp}/robust05_lr.txt {self.tmp}/robust05_svm.txt \
            --output {self.tmp}/robust05_rrf.txt --resort'

        status = os.system(rrf_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_rrf.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2401, delta=0.0001)

    def test_robust05_rrf_rm3(self):
        pyserini_topics = 'robust05'
        lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt \
            --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'

        status = os.system(lr_cmd)
        self.assertEqual(status, 0)

        svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
            --topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt \
            --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'

        status = os.system(svm_cmd)
        self.assertEqual(status, 0)

        rrf_cmd = f'{self.pyserini_fusion_cmd} \
            --runs {self.tmp}/robust05_lr_rm3.txt {self.tmp}/robust05_svm_rm3.txt \
            --output {self.tmp}/robust05_rrf_rm3.txt --resort'

        status = os.system(rrf_cmd)
        self.assertEqual(status, 0)

        score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
            {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
            {self.tmp}/robust05_rrf_rm3.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        score = parse_score(stdout, 'map')

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertAlmostEqual(score, 0.2808, delta=0.0001)

    def tearDown(self):
        shutil.rmtree(f'{self.tmp}')


if __name__ == '__main__':
    unittest.main()
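The robust04 and robust05 fusion tests above feed two classifier-reranked runs to `python -m pyserini.fusion ... --resort`. As an illustration of the general idea behind reciprocal rank fusion, here is a minimal, self-contained sketch; it is not the pyserini.fusion implementation, and the constant k=60 plus the dict-based run format are assumptions made only for this example.

from collections import defaultdict

def reciprocal_rank_fusion(runs, k=60):
    """Illustrative RRF: each run maps qid -> list of docids ordered by rank."""
    fused = defaultdict(lambda: defaultdict(float))
    for run in runs:
        for qid, ranking in run.items():
            for rank, docid in enumerate(ranking, start=1):
                fused[qid][docid] += 1.0 / (k + rank)
    # Re-sort each query's documents by the fused score, best first.
    return {qid: sorted(scores, key=scores.get, reverse=True)
            for qid, scores in fused.items()}

# Toy usage with two hypothetical runs for a single query.
run_a = {'301': ['d1', 'd2', 'd3']}
run_b = {'301': ['d2', 'd3', 'd1']}
print(reciprocal_rank_fusion([run_a, run_b])['301'])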
integrations/clprf/test_trec_covid_r3.py
ADDED
@@ -0,0 +1,85 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import os
import re
import shutil
import unittest
from random import randint

from pyserini.util import download_url, download_prebuilt_index


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):

        curdir = os.getcwd()
        if curdir.endswith('clprf'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'

        self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

        # In the rare event there's a collision
        if os.path.exists(self.tmp):
            shutil.rmtree(self.tmp)

        os.mkdir(self.tmp)
        os.mkdir(f'{self.tmp}/runs')

        self.round3_runs = {
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
                'dfccc32efd58a8284ae411e5c6b27ce9',
        }

        download_url('https://ir.nist.gov/covidSubmit/archive/round3/covidex.r3.monot5',
                     f'{self.tmp}/runs')

        for url in self.round3_runs:
            print(f'Verifying stored run at {url}...')
            filename = url.split('/')[-1]
            filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter

            download_url(url, self.tmp, md5=self.round3_runs[url], force=True)
            self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))

    def test_bm25(self):
        tmp_folder_name = self.tmp.split('/')[-1]
        prebuilt_index_path = download_prebuilt_index('trec-covid-r3-abstract')
        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.5 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3.txt \
            -base {self.tmp}/runs/covidex.r3.monot5 \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round2-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r3.t5.lr \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3333", data['map'])
            self.assertEqual("0.6916", data['ndcg'])

    def tearDown(self):
        shutil.rmtree(self.tmp)


if __name__ == '__main__':
    unittest.main()
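setUp above verifies the cached qrels file through the md5 argument of pyserini.util.download_url. A minimal sketch of the same integrity check done by hand with the standard library follows; the chunk size and the commented-out file path are illustrative choices here, not what download_url actually uses internally.

import hashlib

def md5_of(path, chunk_size=8192):
    """Compute the MD5 digest of a file, reading it in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Compare against the expected checksum recorded in self.round3_runs, e.g.:
# assert md5_of('qrels.covid-round3-cumulative.txt') == 'dfccc32efd58a8284ae411e5c6b27ce9'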
integrations/clprf/test_trec_covid_r4.py
ADDED
@@ -0,0 +1,91 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import gzip
import json
import os
import re
import shutil
import unittest
from random import randint

from pyserini.util import download_url, download_prebuilt_index


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):

        curdir = os.getcwd()
        if curdir.endswith('clprf'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'

        self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

        # In the rare event there's a collision
        if os.path.exists(self.tmp):
            shutil.rmtree(self.tmp)

        os.mkdir(self.tmp)
        os.mkdir(f'{self.tmp}/runs')

        self.round4_runs = {
            'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt':
                '7a5c27e8e052c49ff72d557051825973',
        }

        download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz',
                     f'{self.tmp}/runs')

        with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in:
            with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        for url in self.round4_runs:
            print(f'Verifying stored run at {url}...')
            filename = url.split('/')[-1]
            filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter

            download_url(url, self.tmp, md5=self.round4_runs[url], force=True)
            self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
            print('')

    def test_bm25(self):
        tmp_folder_name = self.tmp.split('/')[-1]
        prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract')
        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.6 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4.txt \
            -base {self.tmp}/runs/covidex.r4.d2q.duot5 \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r4.d2q.duot5.lr \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3846", data['map'])
            self.assertEqual("0.7745", data['ndcg'])

    def tearDown(self):
        shutil.rmtree(self.tmp)


if __name__ == '__main__':
    unittest.main()
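As in the round 3 test, setUp builds its scratch directory from randint and guards against collisions by hand. Purely as a design alternative, the sketch below shows the same idea with tempfile, which makes the collision check unnecessary; it is not how these tests are actually written.

import shutil
import tempfile
import unittest

class ScratchDirSketch(unittest.TestCase):
    def setUp(self):
        # mkdtemp always returns a fresh, unique directory, so no collision handling is needed.
        self.tmp = tempfile.mkdtemp(prefix='pyserini-integrations-')

    def tearDown(self):
        shutil.rmtree(self.tmp)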
integrations/clprf/test_trec_covid_r5.py
ADDED
@@ -0,0 +1,141 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import re
import shutil
import unittest
import json
import gzip
from random import randint
from pyserini.util import download_url, download_prebuilt_index


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):

        curdir = os.getcwd()
        if curdir.endswith('clprf'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'

        self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

        # In the rare event there's a collision
        if os.path.exists(self.tmp):
            shutil.rmtree(self.tmp)

        os.mkdir(self.tmp)
        os.mkdir(f'{self.tmp}/runs')

        self.round5_runs = {
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.1s.gz':
                '2181ae5b7fe8bafbd3b41700f3ccde02',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.2s.gz':
                'e61f9b6de5ffbe1b5b82d35216968154',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.2s.gz':
                '6e517a5e044d8b7ce983f7e165cf4aeb',
            'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.1s.gz':
                'dc9b4b45494294a8448cf0693f07f7fd'
        }

        for url in self.round5_runs:
            print(f'Verifying stored run at {url}...')
            filename = url.split('/')[-1]
            filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter
            gzip_filename = '.'.join(filename.split('.')[:-1])

            download_url(url, f'{self.tmp}/runs/', md5=self.round5_runs[url], force=True)
            self.assertTrue(os.path.exists(os.path.join(f'{self.tmp}/runs/', filename)))
            with gzip.open(f'{self.tmp}/runs/{filename}', 'rb') as f_in:
                with open(f'{self.tmp}/runs/{gzip_filename}', 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

    def test_round5(self):
        tmp_folder_name = self.tmp.split('/')[-1]
        prebuilt_index_path = download_prebuilt_index('trec-covid-r5-abstract')

        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.6 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
            -base {self.tmp}/runs/covidex.r5.d2q.1s \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r5.d2q.1s \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3859", data['map'])
            self.assertEqual("0.8221", data['ndcg'])

        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.6 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
            -base {self.tmp}/runs/covidex.r5.d2q.2s \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r5.d2q.2s \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3875", data['map'])
            self.assertEqual("0.8304", data['ndcg'])

        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.6 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
            -base {self.tmp}/runs/covidex.r5.1s \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r5.1s \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3885", data['map'])
            self.assertEqual("0.8135", data['ndcg'])

        os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
            -alpha 0.6 \
            -clf lr \
            -vectorizer tfidf \
            -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
            -base {self.tmp}/runs/covidex.r5.2s \
            -tmp_base {tmp_folder_name} \
            -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
            -index {prebuilt_index_path} \
            -tag covidex.r5.2s \
            -output {self.tmp}/output.json')
        with open(f'{self.tmp}/output.json') as json_file:
            data = json.load(json_file)
            self.assertEqual("0.3922", data['map'])
            self.assertEqual("0.8311", data['ndcg'])

    def tearDown(self):
        shutil.rmtree(self.tmp)


if __name__ == '__main__':
    unittest.main()
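test_round5 above repeats one rank_trec_covid.py invocation four times, changing only the base run and the expected scores. A compact, data-driven way to express that pattern with unittest's subTest is sketched below; build_cmd is a hypothetical placeholder standing in for the full command string shown above, so treat this as an illustration of the structure rather than a drop-in replacement.

import json
import os
import unittest

class Round5Sketch(unittest.TestCase):
    # (base run, expected map, expected ndcg), copied from the assertions above.
    CASES = [
        ('covidex.r5.d2q.1s', '0.3859', '0.8221'),
        ('covidex.r5.d2q.2s', '0.3875', '0.8304'),
        ('covidex.r5.1s', '0.3885', '0.8135'),
        ('covidex.r5.2s', '0.3922', '0.8311'),
    ]

    def build_cmd(self, base):
        # Placeholder: the real test assembles the full rank_trec_covid.py command
        # shown above, varying only -base and -tag.
        return f'python scripts/classifier_prf/rank_trec_covid.py -base runs/{base} -tag {base} -output output.json'

    def test_round5(self):
        for base, expected_map, expected_ndcg in self.CASES:
            with self.subTest(base=base):
                self.assertEqual(os.system(self.build_cmd(base)), 0)
                with open('output.json') as f:
                    data = json.load(f)
                self.assertEqual(expected_map, data['map'])
                self.assertEqual(expected_ndcg, data['ndcg'])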
integrations/dense/test_ance.py
ADDED
@@ -0,0 +1,185 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for ANCE and ANCE PRF using on-the-fly query encoding."""

import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score, parse_score_qa, parse_score_msmarco
from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256
        self.rocchio_alpha = 0.4
        self.rocchio_beta = 0.6

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_ance_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('ance-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('ance-dl19-passage')
        topics = get_topics('dl19-passage')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('ance-dl20')
        topics = get_topics('dl20')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def test_msmarco_passage_ance_avg_prf_otf(self):
        output_file = 'test_run.dl2019.ance.avg-prf.otf.trec'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics dl19-passage \
            --index msmarco-passage-ance-bf \
            --encoder castorini/ance-msmarco-passage \
            --batch-size {self.batch_size} \
            --threads {self.threads} \
            --output {output_file} \
            --prf-depth 3 \
            --prf-method avg'
        cmd2 = f'python -m pyserini.eval.trec_eval -l 2 -m map dl19-passage {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, 'map')
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.4247, delta=0.0001)

    def test_msmarco_passage_ance_rocchio_prf_otf(self):
        output_file = 'test_run.dl2019.ance.rocchio-prf.otf.trec'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics dl19-passage \
            --index msmarco-passage-ance-bf \
            --encoder castorini/ance-msmarco-passage \
            --batch-size {self.batch_size} \
            --threads {self.threads} \
            --output {output_file} \
            --prf-depth 5 \
            --prf-method rocchio \
            --rocchio-topk 5 \
            --threads {self.threads} \
            --rocchio-alpha {self.rocchio_alpha} \
            --rocchio-beta {self.rocchio_beta}'
        cmd2 = f'python -m pyserini.eval.trec_eval -l 2 -m map dl19-passage {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, 'map')
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.4211, delta=0.0001)

    def test_msmarco_doc_ance_bf_otf(self):
        output_file = 'test_run.msmarco-doc.passage.ance-maxp.otf.txt'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-doc-dev \
            --index msmarco-doc-ance-maxp-bf \
            --encoder castorini/ance-msmarco-doc-maxp \
            --output {output_file}\
            --hits 1000 \
            --max-passage \
            --max-passage-hits 100 \
            --output-format msmarco \
            --batch-size {self.batch_size} \
            --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score_msmarco(stdout, 'MRR @100')
        self.assertEqual(status, 0)
        # We get a small difference, 0.3794 on macOS.
        self.assertAlmostEqual(score, 0.3796, delta=0.0002)

    def test_msmarco_doc_ance_bf_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('ance_maxp-msmarco-doc-dev')
        topics = get_topics('msmarco-doc-dev')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_nq_test_ance_bf_otf(self):
        output_file = 'test_run.ance.nq-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.ance.nq-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-nq-test \
            --index wikipedia-ance-multi-bf \
            --encoder castorini/ance-dpr-question-multi \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.8224, places=4)

    def test_nq_test_ance_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-nq-test')
        topics = get_topics('dpr-nq-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_trivia_test_ance_bf_otf(self):
        output_file = 'test_run.ance.trivia-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.ance.trivia-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-trivia-test \
            --index wikipedia-ance-multi-bf \
            --encoder castorini/ance-dpr-question-multi \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.8010, places=4)

    def test_trivia_test_ance_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-trivia-test')
        topics = get_topics('dpr-trivia-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
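Most tests above score a run by shelling out to trec_eval via run_command and pulling one metric back with parse_score from integrations.utils. As a rough illustration of what that parsing amounts to, the sketch below assumes the usual trec_eval aggregate output layout of "metric <tab> all <tab> value"; it is not the integrations.utils implementation.

def parse_trec_eval_metric(stdout, metric):
    """Pull a single aggregate metric value out of trec_eval-style output."""
    for line in stdout.splitlines():
        fields = line.split()
        if len(fields) == 3 and fields[0] == metric and fields[1] == 'all':
            return float(fields[2])
    raise ValueError(f'metric {metric} not found')

sample = 'map\tall\t0.4247\nrecip_rank\tall\t0.8500'
print(parse_trec_eval_metric(sample, 'map'))  # 0.4247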
integrations/dense/test_distilbert_kd.py
ADDED
@@ -0,0 +1,45 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for DistilBERT KD."""

import unittest

from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    # Note that we test actual retrieval in 2CR, so no need to test here.
    def test_distilbert_kd_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('distilbert_kd-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('distilbert_kd-dl19-passage')
        topics = get_topics('dl19-passage')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('distilbert_kd-dl20')
        topics = get_topics('dl20')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)


if __name__ == '__main__':
    unittest.main()
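The encoded-query tests above only assert that every topic title has a pre-computed vector. The short sketch below shows how the same QueryEncoder object could be used to look one of those vectors up; treating .embedding as a mapping from query text to a vector is inferred from the membership checks above, so consider this a usage illustration rather than documented API.

from pyserini.search import QueryEncoder, get_topics

encoded = QueryEncoder.load_encoded_queries('distilbert_kd-msmarco-passage-dev-subset')
topics = get_topics('msmarco-passage-dev-subset')

# Grab the pre-computed embedding for the first topic's title.
first_title = next(iter(topics.values()))['title']
vector = encoded.embedding[first_title]
print(first_title, len(vector))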
integrations/dense/test_distilbert_tasb.py
ADDED
@@ -0,0 +1,44 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for DistilBERT TAS-B."""

import unittest

from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def test_distilbert_kd_tas_b_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-dl19-passage')
        topics = get_topics('dl19-passage')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

        encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-dl20')
        topics = get_topics('dl20')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)


if __name__ == '__main__':
    unittest.main()
integrations/dense/test_dpr.py
ADDED
@@ -0,0 +1,339 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for DPR model using pre-encoded queries."""

import json
import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score_qa
from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_dpr_nq_test_bf_otf(self):
        output_file = 'test_run.dpr.nq-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.dpr.nq-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-nq-test \
            --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.7947, places=4)

    def test_dpr_nq_test_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.dpr.nq-test.multi.bf.otf.bm25.trec'
        retrieval_file = 'test_run.dpr.nq-test.multi.bf.otf.bm25.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            sparse --index wikipedia-dpr \
            fusion --alpha 1.3 \
            run --topics dpr-nq-test \
            --batch-size {self.batch_size} --threads {self.threads} \
            --output {output_file} '
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.8260, places=4)

    def test_dpr_nq_test_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-nq-test')
        topics = get_topics('dpr-nq-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_dpr_trivia_test_bf_otf(self):
        output_file = 'test_run.dpr.trivia-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.dpr.trivia-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-trivia-test \
            --encoder facebook/dpr-question_encoder-multiset-base \
            --index wikipedia-dpr-multi-bf \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.7887, places=4)

    def test_dpr_trivia_test_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.dpr.trivia-test.multi.bf.otf.bm25.trec'
        retrieval_file = 'test_run.dpr.trivia-test.multi.bf.otf.bm25.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            sparse --index wikipedia-dpr \
            fusion --alpha 0.95 \
            run --topics dpr-trivia-test \
            --batch-size {self.batch_size} --threads {self.threads} \
            --output {output_file} '
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.8264, places=4)

    def test_dpr_trivia_test_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-trivia-test')
        topics = get_topics('dpr-trivia-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_dpr_wq_test_bf_otf(self):
        output_file = 'test_run.dpr.wq-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.dpr.wq-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-wq-test \
            --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-wq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.7505, places=4)

    def test_dpr_wq_test_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.dpr.wq-test.multi.bf.otf.bm25.trec'
        retrieval_file = 'test_run.dpr.wq-test.multi.bf.otf.bm25.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            sparse --index wikipedia-dpr \
            fusion --alpha 0.95 \
            run --topics dpr-wq-test \
            --batch-size {self.batch_size} --threads {self.threads} \
            --output {output_file} '
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-wq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.7712, places=4)

    def test_dpr_wq_test_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-wq-test')
        topics = get_topics('dpr-wq-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_dpr_curated_test_bf_otf(self):
        output_file = 'test_run.dpr.curated-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.dpr.curated-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-curated-test \
            --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-curated-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file} \
            --regex'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20 --regex'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.8876, places=4)

    def test_dpr_curated_test_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.dpr.curated-test.multi.bf.otf.bm25.trec'
        retrieval_file = 'test_run.dpr.curated-test.multi.bf.otf.bm25.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            sparse --index wikipedia-dpr \
            fusion --alpha 1.05 \
            run --topics dpr-curated-test \
            --batch-size {self.batch_size} --threads {self.threads} \
            --output {output_file} '
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-curated-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file} \
            --regex'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20 --regex'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.9006, places=4)

    def test_dpr_curated_test_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-curated-test')
        topics = get_topics('dpr-curated-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_dpr_squad_test_bf_otf(self):
        output_file = 'test_run.dpr.squad-test.multi.bf.otf.trec'
        retrieval_file = 'test_run.dpr.squad-test.multi.bf.otf.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.faiss --topics dpr-squad-test \
            --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            --output {output_file} \
            --batch-size {self.batch_size} --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-squad-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        self.assertAlmostEqual(score, 0.5199, places=4)

    def test_dpr_squad_test_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.dpr.squad-test.multi.bf.otf.bm25.trec'
        retrieval_file = 'test_run.dpr.squad-test.multi.bf.otf.bm25.json'
        self.temp_files.extend([output_file, retrieval_file])
        cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
            --encoder facebook/dpr-question_encoder-multiset-base \
            sparse --index wikipedia-dpr \
            fusion --alpha 2.0 \
            run --topics dpr-squad-test \
            --batch-size {self.batch_size} --threads {self.threads} \
            --output {output_file} '
        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-squad-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {retrieval_file}'
        cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
        status1 = os.system(cmd1)
        status2 = os.system(cmd2)
        stdout, stderr = run_command(cmd3)
        score = parse_score_qa(stdout, 'Top20')
        self.assertEqual(status1, 0)
        self.assertEqual(status2, 0)
        # This appears to be a flaky test case; previously, we were getting a score of 0.7511, per
        # https://github.com/castorini/pyserini/pull/1273/files#diff-799c2c339e1d7defa31fa1e82f9b16886269b37805376ef93f7c8afedcee574e
        # Sometimes we get 0.7512. Fix is to reduce tolerance.
        self.assertAlmostEqual(score, 0.7514, places=3)

    def test_dpr_squad_test_encoded_queries(self):
        encoder = QueryEncoder.load_encoded_queries('dpr_multi-squad-test')
        topics = get_topics('dpr-squad-test')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoder.embedding)

    def test_convert_trec_run_to_dpr_retrieval_run(self):
        trec_run_file = 'tests/resources/simple_test_run_convert_trec_run_dpr.trec'
        topics_file = 'tests/resources/simple_topics_dpr.txt'
        dpr_run_file = 'test_run.convert.trec_run.dpr.json'
        collection_path = "tests/resources/sample_collection_dense"
        topic_reader = "io.anserini.search.topicreader.DprNqTopicReader"
        index_dir = 'temp_index'

        self.temp_files.extend([dpr_run_file, index_dir])
        cmd1 = f'python -m pyserini.index.lucene -collection JsonCollection ' + \
               f'-generator DefaultLuceneDocumentGenerator ' + \
               f'-threads 1 -input {collection_path} -index {index_dir} -storeRaw'

        cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics-file {topics_file} \
            --topics-reader {topic_reader} \
            --index {index_dir} \
            --input {trec_run_file} \
            --output {dpr_run_file}'
        _ = os.system(cmd1)
        _ = os.system(cmd2)

        with open(dpr_run_file) as f:
            topic_data = json.load(f)

        self.assertEqual(topic_data["0"]["answers"], ['text'])
        self.assertEqual(topic_data["0"]["question"], "what is in document three")
        self.assertEqual(topic_data["1"]["answers"], ['contents'])
        self.assertEqual(topic_data["1"]["question"], "what is document two")

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
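The hybrid tests above pass a fusion --alpha weight to pyserini.search.hybrid, which interpolates sparse and dense scores before re-ranking. The toy sketch below shows that kind of alpha-weighted interpolation in isolation; the weighting direction (dense plus alpha times sparse), the lack of score normalization, and the toy inputs are all assumptions of this example, not a description of the actual command.

def interpolate(dense_scores, sparse_scores, alpha):
    """Combine dense and sparse scores as dense + alpha * sparse (toy version)."""
    docids = set(dense_scores) | set(sparse_scores)
    combined = {d: dense_scores.get(d, 0.0) + alpha * sparse_scores.get(d, 0.0)
                for d in docids}
    return sorted(combined.items(), key=lambda kv: kv[1], reverse=True)

dense = {'doc1': 78.2, 'doc2': 77.9}
sparse = {'doc2': 11.3, 'doc3': 10.8}
print(interpolate(dense, sparse, alpha=1.3))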
integrations/dense/test_encode.py
ADDED
@@ -0,0 +1,137 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for create dense index """

import os
import shutil
import unittest
from urllib.request import urlretrieve

import faiss

from pyserini.search.faiss import FaissSearcher
from pyserini.search.lucene import LuceneImpactSearcher


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        curdir = os.getcwd()
        if curdir.endswith('dense'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'
        self.temp_folders = []
        self.corpus_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/corpus/jsonl/cacm.json'
        self.corpus_path = f'{self.pyserini_root}/integrations/dense/temp_cacm/'
        os.makedirs(self.corpus_path, exist_ok=True)
        self.temp_folders.append(self.corpus_path)
        urlretrieve(self.corpus_url, os.path.join(self.corpus_path, 'cacm.json'))

    def test_dpr_encode_as_faiss(self):
        index_dir = f'{self.pyserini_root}/temp_index'
        self.temp_folders.append(index_dir)
        cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
            --fields text \
            output --embeddings {index_dir} --to-faiss \
            encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
            --fields text \
            --batch 4 \
            --device cpu'
        _ = os.system(cmd1)
        searcher = FaissSearcher(
            index_dir,
            'facebook/dpr-question_encoder-multiset-base'
        )
        q_emb, hit = searcher.search('What is the solution of separable closed queueing networks?', k=1, return_vector=True)
        self.assertEqual(hit[0].docid, 'CACM-2445')
        self.assertAlmostEqual(hit[0].vectors[0], -6.88267112e-01, places=4)
        self.assertEqual(searcher.num_docs, 3204)

    def test_dpr_encode_as_faiss_search_with_partitions(self):
        # Create two partitions of the CACM index, search them individually, and merge results to compute top hit
        index_dir = f'{self.pyserini_root}/temp_index'
        os.makedirs(os.path.join(index_dir, 'partition1'), exist_ok=True)
        os.makedirs(os.path.join(index_dir, 'partition2'), exist_ok=True)
        self.temp_folders.append(index_dir)
        cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
            --fields text \
            output --embeddings {index_dir} --to-faiss \
            encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
            --fields text \
            --batch 4 \
            --device cpu'
        _ = os.system(cmd1)
        index = faiss.read_index(os.path.join(index_dir, 'index'))
        new_index_partition1 = faiss.IndexFlatIP(index.d)
        new_index_partition2 = faiss.IndexFlatIP(index.d)
        vectors_partition1 = index.reconstruct_n(0, index.ntotal // 2)
        vectors_partition2 = index.reconstruct_n(index.ntotal // 2, index.ntotal - index.ntotal // 2)
        new_index_partition1.add(vectors_partition1)
        new_index_partition2.add(vectors_partition2)

        faiss.write_index(new_index_partition1, os.path.join(index_dir, 'partition1/index'))
        faiss.write_index(new_index_partition2, os.path.join(index_dir, 'partition2/index'))

        with open(os.path.join(index_dir, 'partition1/docid'), 'w') as docid1, open(os.path.join(index_dir, 'partition2/docid'), 'w') as docid2:
            with open(os.path.join(index_dir, 'docid'), 'r') as file:
                for i in range(index.ntotal):
                    line = next(file)
                    if i < (index.ntotal // 2):
                        docid1.write(line)
                    else:
                        docid2.write(line)

        searcher_partition1 = FaissSearcher(index_dir + '/partition1', 'facebook/dpr-question_encoder-multiset-base')
        searcher_partition2 = FaissSearcher(index_dir + '/partition2', 'facebook/dpr-question_encoder-multiset-base')
        q_emb, hit1 = searcher_partition1.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True)
        q_emb, hit2 = searcher_partition2.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True)
        merged_hits = hit1 + hit2
        merged_hits.sort(key=lambda x: x.score, reverse=True)

        self.assertEqual(merged_hits[0].docid, 'CACM-2445')
        self.assertAlmostEqual(merged_hits[0].vectors[0], -6.88267112e-01, places=4)
|
107 |
+
self.assertEqual(searcher_partition1.num_docs, 1602)
|
108 |
+
self.assertEqual(searcher_partition2.num_docs, 1602)
|
109 |
+
|
110 |
+
def test_unicoil_encode_as_jsonl(self):
|
111 |
+
embedding_dir = f'{self.pyserini_root}/temp_embeddings'
|
112 |
+
self.temp_folders.append(embedding_dir)
|
113 |
+
cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
|
114 |
+
--fields text \
|
115 |
+
output --embeddings {embedding_dir} \
|
116 |
+
encoder --encoder castorini/unicoil-msmarco-passage \
|
117 |
+
--fields text \
|
118 |
+
--batch 4 \
|
119 |
+
--device cpu'
|
120 |
+
_ = os.system(cmd1)
|
121 |
+
index_dir = f'{self.pyserini_root}/temp_lucene'
|
122 |
+
self.temp_folders.append(index_dir)
|
123 |
+
cmd2 = f'python -m pyserini.index -collection JsonVectorCollection \
|
124 |
+
-input {embedding_dir} \
|
125 |
+
-index {index_dir} \
|
126 |
+
-generator DefaultLuceneDocumentGenerator \
|
127 |
+
-impact -pretokenized -threads 12 -storeRaw'
|
128 |
+
_ = os.system(cmd2)
|
129 |
+
searcher = LuceneImpactSearcher(index_dir, query_encoder='castorini/unicoil-msmarco-passage')
|
130 |
+
hits = searcher.search('What is the solution of separable closed queueing networks?', k=1)
|
131 |
+
hit = hits[0]
|
132 |
+
self.assertEqual(hit.docid, 'CACM-2712')
|
133 |
+
self.assertAlmostEqual(hit.score, 18.402, places=3)
|
134 |
+
|
135 |
+
def tearDown(self):
|
136 |
+
for f in self.temp_folders:
|
137 |
+
shutil.rmtree(f)
|
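The partition test above splits one FAISS flat index into two shards and merges the per-shard results. The same pattern can be written as a standalone helper; this is a minimal sketch mirroring that test (paths, encoder name, and query are taken from the test itself and are otherwise assumptions), not an API that ships with Pyserini.

from pyserini.search.faiss import FaissSearcher

def search_shards(shard_dirs, encoder_name, query, k=10):
    # Search each shard independently; each shard directory holds its own
    # FAISS 'index' file plus the matching 'docid' file, as built in the test.
    merged = []
    for shard in shard_dirs:
        searcher = FaissSearcher(shard, encoder_name)
        merged.extend(searcher.search(query, k=k))
    # Inner-product scores from the same encoder are comparable across shards,
    # so a global sort by score recovers the single-index ranking.
    return sorted(merged, key=lambda hit: hit.score, reverse=True)[:k]

top_hits = search_shards(['temp_index/partition1', 'temp_index/partition2'],
                         'facebook/dpr-question_encoder-multiset-base',
                         'What is the solution of separable closed queueing networks?')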
integrations/dense/test_kilt.py
ADDED
@@ -0,0 +1,69 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for KILT integration."""

import os
import re
import socket
import unittest

from integrations.utils import clean_files, run_command


def parse_kilt_score(output, metric, digits=4):
    pattern = re.compile(r"[0-1]\.[0-9]*")
    for line in output.split('\n')[::-1]:
        if metric in line:
            score = float(pattern.search(line).group(0))
            return round(score, digits)
    return None


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_kilt_search(self):
        run_file = 'test_run.fever-dev-kilt.jsonl'
        self.temp_files.append(run_file)
        cmd1 = f'python -m pyserini.search --topics fever-dev-kilt \
                    --topics-format kilt \
                    --index wikipedia-kilt-doc \
                    --output {run_file} \
                    --output-format kilt \
                    --threads {self.threads} \
                    --batch-size {self.batch_size}'
        status = os.system(cmd1)
        self.assertEqual(status, 0)
        cmd2 = f'python -m pyserini.eval.evaluate_kilt_retrieval {run_file} fever-dev-kilt --ks 1,100'
        stdout, stderr = run_command(cmd2)
        score = parse_kilt_score(stdout, "Rprec")
        self.assertAlmostEqual(score, 0.3821, delta=0.0001)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/dense/test_sbert.py
ADDED
@@ -0,0 +1,67 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for SBERT models using on-the-fly query encoding."""

import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score
from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_msmarco_passage_sbert_bf_otf(self):
        output_file = 'test_run.msmarco-passage.sbert.bf.otf.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-sbert-bf \
                    --encoder sentence-transformers/msmarco-distilbert-base-v3 \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3314, delta=0.0001)

    def test_msmarco_passage_sbert_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('sbert-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/dense/test_tct_colbert-v2.py
ADDED
@@ -0,0 +1,132 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for TCT-ColBERTv2 models using on-the-fly query encoding."""

import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score
from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_msmarco_passage_tct_colbert_v2_bf_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert-v2.bf-otf.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-tct_colbert-v2-bf \
                    --encoder castorini/tct_colbert-v2-msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3440, delta=0.0001)

    def test_msmarco_passage_tct_colbert_v2_hn_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert-v2-hn.bf-otf.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-tct_colbert-v2-hn-bf \
                    --encoder castorini/tct_colbert-v2-hn-msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3543, delta=0.0001)

    def test_msmarco_passage_tct_colbert_v2_hnp_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert-v2-hnp.bf-otf.bm25.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-v2-hnp-bf \
                    --encoder castorini/tct_colbert-v2-hnp-msmarco \
                    sparse --index msmarco-passage \
                    fusion --alpha 0.06 \
                    run --topics msmarco-passage-dev-subset \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3682, delta=0.0001)

    def test_msmarco_passage_tct_colbert_v2_hnp_bf_d2q_hybrid_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert-v2-hnp.bf-otf.doc2queryT5.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-v2-hnp-bf \
                    --encoder castorini/tct_colbert-v2-hnp-msmarco \
                    sparse --index msmarco-passage-expanded \
                    fusion --alpha 0.1 \
                    run --topics msmarco-passage-dev-subset \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3731, delta=0.0001)

    def test_msmarco_passage_tct_colbert_v2_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def test_msmarco_passage_tct_colbert_v2_hn_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-hn-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def test_msmarco_passage_tct_colbert_v2_hnp_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-hnp-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/dense/test_tct_colbert.py
ADDED
@@ -0,0 +1,183 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for TCT-ColBERTv1 models using on-the-fly query encoding."""

import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score
from pyserini.search import QueryEncoder
from pyserini.search import get_topics


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 16
        self.batch_size = 256

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_msmarco_passage_tct_colbert_bf_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads} \
                    --output {output_file} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        # We get a small difference in scores on macOS vs. Linux, better way to check:
        self.assertAlmostEqual(score, 0.3350, delta=0.0002)

    def test_msmarco_passage_tct_colbert_hnsw_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert.hnsw-otf.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
                    --index msmarco-passage-tct_colbert-hnsw \
                    --encoder castorini/tct_colbert-msmarco \
                    --output {output_file} \
                    --output-format msmarco '
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3345, delta=0.0002)

    def test_msmarco_passage_tct_colbert_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.bm25.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    sparse --index msmarco-passage \
                    fusion --alpha 0.12 \
                    run --topics msmarco-passage-dev-subset \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3529, places=4)

    def test_msmarco_passage_tct_colbert_bf_d2q_hybrid_otf(self):
        output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.doc2queryT5.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    sparse --index msmarco-passage-expanded \
                    fusion --alpha 0.22 \
                    run --topics msmarco-passage-dev-subset \
                    --output {output_file} \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3647, places=4)

    def test_msmarco_passage_tct_colbert_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('tct_colbert-msmarco-passage-dev-subset')
        topics = get_topics('msmarco-passage-dev-subset')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def test_msmarco_doc_tct_colbert_bf_otf(self):
        output_file = 'test_run.msmarco-doc.passage.tct_colbert-otf.txt'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.faiss --topics msmarco-doc-dev \
                    --index msmarco-doc-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    --output {output_file} \
                    --hits 1000 \
                    --max-passage \
                    --max-passage-hits 100 \
                    --output-format msmarco \
                    --batch-size {self.batch_size} \
                    --threads {self.threads}'
        cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @100")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3323, places=4)

    def test_msmarco_doc_tct_colbert_bf_bm25_hybrid_otf(self):
        output_file = 'test_run.msmarco-doc.tct_colbert.bf-otf.bm25.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-doc-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    sparse --index msmarco-doc-per-passage \
                    fusion --alpha 0.25 \
                    run --topics msmarco-doc-dev \
                    --output {output_file} \
                    --hits 1000 --max-passage --max-passage-hits 100 \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @100")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3701, places=4)

    def test_msmarco_doc_tct_colbert_bf_d2q_hybrid_otf(self):
        output_file = 'test_run.msmarco-doc.tct_colbert.bf-otf.doc2queryT5.tsv'
        self.temp_files.append(output_file)
        cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-doc-tct_colbert-bf \
                    --encoder castorini/tct_colbert-msmarco \
                    sparse --index msmarco-doc-expanded-per-passage \
                    fusion --alpha 0.32 \
                    run --topics msmarco-doc-dev \
                    --output {output_file} \
                    --hits 1000 --max-passage --max-passage-hits 100 \
                    --batch-size {self.batch_size} --threads {self.threads} \
                    --output-format msmarco'
        cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @100")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.3784, places=4)

    def test_msmarco_doc_tct_colbert_encoded_queries(self):
        encoded = QueryEncoder.load_encoded_queries('tct_colbert-msmarco-doc-dev')
        topics = get_topics('msmarco-doc-dev')
        for t in topics:
            self.assertTrue(topics[t]['title'] in encoded.embedding)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
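The hybrid runs above combine a dense index and a sparse index through `fusion --alpha`. The sketch below illustrates the general idea of alpha-weighted score interpolation; it is illustrative only, the exact normalization and weighting inside pyserini.search.hybrid may differ, and the hit lists passed in are assumptions.

def interpolate(dense_hits, sparse_hits, alpha=0.12, k=1000):
    # Sum dense scores with alpha-weighted sparse scores for every document
    # that appears in either list, then re-rank by the combined score.
    fused = {}
    for hit in dense_hits:
        fused[hit.docid] = fused.get(hit.docid, 0.0) + hit.score
    for hit in sparse_hits:
        fused[hit.docid] = fused.get(hit.docid, 0.0) + alpha * hit.score
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)[:k]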
integrations/lucenesearcher_anserini_checker.py
ADDED
@@ -0,0 +1,78 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import filecmp
import os
from typing import List


class LuceneSearcherAnseriniMatchChecker:
    def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: str, qrels: str, eval_root: str):
        self.anserini_root = anserini_root
        self.index_path = index
        self.topics = topics
        self.qrels = qrels
        self.pyserini_topics = pyserini_topics

        self.anserini_base_cmd = os.path.join(self.anserini_root,
                                              'target/appassembler/bin/SearchCollection -topicreader Trec')
        self.pyserini_base_cmd = 'python -m pyserini.search.lucene'

        self.eval_base_cmd = os.path.join(eval_root, 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30')

    @staticmethod
    def _cleanup(files: List[str]):
        for file in files:
            if os.path.exists(file):
                os.remove(file)

    def run(self, runtag: str, anserini_extras: str, pyserini_extras: str):
        print('-------------------------')
        print(f'Running {runtag}:')
        print('-------------------------')

        anserini_output = f'verify.anserini.{runtag}.txt'
        pyserini_output = f'verify.pyserini.{runtag}.txt'

        anserini_cmd = f'{self.anserini_base_cmd} -index {self.index_path} ' \
            + f'-topics {self.topics} -output {anserini_output} {anserini_extras}'
        pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \
            + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}'

        status = os.system(anserini_cmd)
        if not status == 0:
            self._cleanup([anserini_output, pyserini_output])
            return False
        status = os.system(pyserini_cmd)
        if not status == 0:
            self._cleanup([anserini_output, pyserini_output])
            return False

        res = filecmp.cmp(anserini_output, pyserini_output)
        if res is True:
            eval_cmd = f'{self.eval_base_cmd} {self.qrels} {anserini_output}'
            status = os.system(eval_cmd)
            if not status == 0:
                print(f'[FAIL] {runtag} evaluation failure!')
                self._cleanup([anserini_output, pyserini_output])
                return False
            print(f'[SUCCESS] {runtag} results verified!')
            self._cleanup([anserini_output, pyserini_output])
            return True
        else:
            print(f'[FAIL] {runtag} results do not match!')
            self._cleanup([anserini_output, pyserini_output])
            return False
integrations/lucenesearcher_score_checker.py
ADDED
@@ -0,0 +1,71 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from typing import List

from integrations.utils import run_command, parse_score


class LuceneSearcherScoreChecker:
    def __init__(self, index: str, topics: str, pyserini_topics: str, qrels: str, eval: str):
        self.index_path = index
        self.topics = topics
        self.qrels = qrels
        self.pyserini_topics = pyserini_topics

        self.pyserini_base_cmd = 'python -m pyserini.search.lucene'

        self.eval_base_cmd = eval

    @staticmethod
    def _cleanup(files: List[str]):
        for file in files:
            if os.path.exists(file):
                os.remove(file)

    def run(self, runtag: str, pyserini_extras: str, actualscore: float, tokenizer=None):
        print('-------------------------')
        print(f'Running {runtag}:')
        print('-------------------------')

        pyserini_output = f'verify.pyserini.{runtag}.txt'

        pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} \
            --topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}'

        if tokenizer is not None:
            pyserini_cmd = pyserini_cmd + f' --tokenizer {tokenizer}'

        status = os.system(pyserini_cmd)
        if not status == 0:
            return False

        eval_cmd = f'{self.eval_base_cmd} {self.qrels} {pyserini_output}'
        status = os.system(eval_cmd)
        if not status == 0:
            return False

        stdout, stderr = run_command(eval_cmd)
        score = parse_score(stdout, 'map')
        self._cleanup([pyserini_output])

        if actualscore != score:
            return False

        return True
integrations/papers/test_ecir2023.py
ADDED
@@ -0,0 +1,91 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for commands in Pradeep et al. resource paper at ECIR 2023."""

import os
import unittest

from integrations.utils import clean_files, run_command, parse_score, parse_score_qa


class TestECIR2023(unittest.TestCase):
    def setUp(self):
        self.temp_files = []

    def test_section5_sub2_first(self):
        """Sample code of the first command in Section 5.2."""
        metrics = ["Top5", "Top20", "Top100"]
        ground_truth = [73.8, 84.27, 89.34]

        output_file = 'runs/run.nq-test.dkrr.trec'
        json_file = 'runs/run.nq-test.dkrr.json'
        self.temp_files.append(output_file)
        self.temp_files.append(json_file)

        # retrieval
        run_cmd = f'python -m pyserini.search.faiss \
            --index wikipedia-dpr-dkrr-nq \
            --topics nq-test \
            --encoder castorini/dkrr-dpr-nq-retriever \
            --output {output_file} --query-prefix question: \
            --threads 72 --batch-size 72 \
            --hits 100'
        status = os.system(run_cmd)
        self.assertEqual(status, 0)

        # conversion
        convert_cmd = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
            --topics nq-test \
            --index wikipedia-dpr \
            --input {output_file} \
            --output {json_file}'
        status = os.system(convert_cmd)
        self.assertEqual(status, 0)

        # evaluation
        eval_cmd = f'python -m pyserini.eval.evaluate_dpr_retrieval \
            --retrieval {json_file} \
            --topk 5 20 100'
        stdout, stderr = run_command(eval_cmd)

        scores = []
        for mt in metrics:
            scores.append(parse_score_qa(stdout, mt, 4) * 100)

        for score in zip(scores, ground_truth):
            self.assertAlmostEqual(score[0], score[1], delta=0.02)

    def test_section5_sub2_second(self):
        """Sample code of the second command in Section 5.2."""

        cmd_nq = 'python scripts/repro_matrix/run_all_odqa.py --topics nq'
        cmd_tqa = 'python scripts/repro_matrix/run_all_odqa.py --topics tqa'

        # run both commands, check if all tests passed (i.e., returned OK)
        stdout_nq, stderr_nq = run_command(cmd_nq)
        self.assertEqual(stdout_nq.count('[OK]'), 21)

        stdout_tqa, stderr_tqa = run_command(cmd_tqa)
        self.assertEqual(stdout_tqa.count('[OK]'), 21)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/papers/test_sigir2021.py
ADDED
@@ -0,0 +1,180 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for commands in Lin et al. (SIGIR 2021) paper."""

import os
import unittest

from integrations.utils import clean_files, run_command, parse_score_msmarco
from pyserini.dsearch import SimpleDenseSearcher, TctColBertQueryEncoder
from pyserini.hsearch import HybridSearcher
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher
from pyserini.search import get_topics, get_qrels


class TestSIGIR2021(unittest.TestCase):
    def setUp(self):
        self.temp_files = []

    def test_figure1(self):
        """Sample code in Figure 1."""

        searcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')
        hits = searcher.search('what is a lobster roll?', 10)

        self.assertAlmostEqual(hits[0].score, 11.00830, delta=0.0001)
        self.assertEqual(hits[0].docid, '7157707')

        self.assertAlmostEqual(hits[9].score, 9.92200, delta=0.0001)
        self.assertEqual(hits[9].docid, '6234461')

        self.assertEqual(len(hits), 10)

    def test_figure2(self):
        """Sample code in Figure 2."""

        encoder = TctColBertQueryEncoder('castorini/tct_colbert-msmarco')
        searcher = SimpleDenseSearcher.from_prebuilt_index('msmarco-passage-tct_colbert-hnsw', encoder)
        hits = searcher.search('what is a lobster roll')

        self.assertAlmostEqual(hits[0].score, 70.53741, delta=0.0001)
        self.assertEqual(hits[0].docid, '7157710')

        self.assertAlmostEqual(hits[9].score, 69.01737, delta=0.0001)
        self.assertEqual(hits[9].docid, '2920399')

        self.assertEqual(len(hits), 10)

    def test_figure3(self):
        """Sample code in Figure 3."""

        ssearcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')
        encoder = TctColBertQueryEncoder('castorini/tct_colbert-msmarco')
        dsearcher = SimpleDenseSearcher.from_prebuilt_index('msmarco-passage-tct_colbert-hnsw', encoder)
        hsearcher = HybridSearcher(dsearcher, ssearcher)

        hits = hsearcher.search('what is a lobster roll')

        self.assertAlmostEqual(hits[0].score, 71.56023, delta=0.0001)
        self.assertEqual(hits[0].docid, '7157715')

        self.assertAlmostEqual(hits[9].score, 70.07635, delta=0.0001)
        self.assertEqual(hits[9].docid, '7157708')

        self.assertEqual(len(hits), 10)

    def test_figure4(self):
        """Sample code in Figure 4."""

        topics = get_topics('msmarco-passage-dev-subset')
        qrels = get_qrels('msmarco-passage-dev-subset')

        self.assertEqual(len(topics), 6980)
        self.assertEqual(len(qrels), 6980)

        # Compute the average length of queries:
        avg_qlen = sum([len(topics[t]['title'].split()) for t in topics])/len(topics)

        # Compute the average number of relevance judgments per query:
        avg_qrels = sum([len(qrels[t]) for t in topics])/len(topics)

        self.assertAlmostEqual(avg_qlen, 5.925, delta=0.001)
        self.assertAlmostEqual(avg_qrels, 1.065, delta=0.001)

    def test_figure5(self):
        """Sample code in Figure 5."""

        # Initialize from a pre-built index:
        reader = IndexReader.from_prebuilt_index('robust04')

        terms = reader.terms()
        term = next(terms)
        self.assertEqual(term.term, '0')
        self.assertEqual(term.df, 10826)
        self.assertEqual(term.cf, 33491)

        term = next(terms)

        self.assertEqual(term.term, '0,0')
        self.assertEqual(term.df, 2)
        self.assertEqual(term.cf, 2)

        # Analyze a term:
        term = 'atomic'
        analyzed = reader.analyze(term)
        self.assertEqual(analyzed[0], 'atom')

        # Directly fetch term statistics for a term:
        df, cf = reader.get_term_counts(term)
        self.assertEqual(df, 5219)
        self.assertEqual(cf, 9144)

        # Traverse postings for a term:
        postings_list = reader.get_postings_list(term)
        self.assertEqual(len(postings_list), 5219)
        self.assertEqual(postings_list[0].docid, 432)
        self.assertEqual(postings_list[0].tf, 1)
        self.assertEqual(postings_list[0].positions, [137])
        self.assertEqual(postings_list[5218].docid, 527779)
        self.assertEqual(postings_list[5218].tf, 1)
        self.assertEqual(postings_list[5218].positions, [21])

        # Examples of manipulating document vectors:
        tf = reader.get_document_vector('LA071090-0047')
        tp = reader.get_term_positions('LA071090-0047')
        df = {
            term: (reader.get_term_counts(term, analyzer=None))[0]
            for term in tf.keys()
        }
        bm25_vector = {
            term: reader.compute_bm25_term_weight('LA071090-0047',
                                                  term,
                                                  analyzer=None)
            for term in tf.keys()
        }

        self.assertEqual(tf['hubbl'], 12)
        self.assertEqual(tp['caught'], [42, 624, 960])
        self.assertEqual(df['problem'], 82225)
        self.assertAlmostEqual(bm25_vector['hubbl'], 7.49397, delta=0.001)
        self.assertAlmostEqual(bm25_vector['earth'], 2.64872, delta=0.001)

    def test_section3_3(self):
        """Sample code in Section 3.3."""

        output_file = 'run.msmarco-passage.txt'
        self.temp_files.append(output_file)
        run_cmd = f'python -m pyserini.search --topics msmarco-passage-dev-subset \
            --index msmarco-passage --output {output_file} \
            --bm25 --output-format msmarco'
        status = os.system(run_cmd)
        self.assertEqual(status, 0)

        eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval \
            msmarco-passage-dev-subset {output_file}'
        stdout, stderr = run_command(eval_cmd)
        score = parse_score_msmarco(stdout, "MRR @10")
        self.assertAlmostEqual(score, 0.1872, delta=0.0001)
        # Temporary fix: this is Lucene 9 code running on Lucene 8 prebuilt index.

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/papers/test_sigir2022.py
ADDED
@@ -0,0 +1,100 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for commands in Ma et al. resource paper and Trotman et al. demo paper at SIGIR 2022."""

import os
import unittest

from integrations.utils import clean_files, run_command, parse_score, parse_score_msmarco


class TestSIGIR2022(unittest.TestCase):
    def setUp(self):
        self.temp_files = []

    def test_Ma_etal_section4_1a(self):
        """Sample code in Section 4.1. in Ma et al. resource paper."""

        output_file = 'run.msmarco-passage.expanded.txt'
        self.temp_files.append(output_file)
        run_cmd = f'python -m pyserini.search.lucene \
            --index msmarco-v1-passage-d2q-t5 \
            --topics msmarco-passage-dev-subset \
            --output {output_file} \
            --output-format msmarco \
            --bm25'
        status = os.system(run_cmd)
        self.assertEqual(status, 0)

        eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval \
            msmarco-passage-dev-subset {output_file}'
        stdout, stderr = run_command(eval_cmd)
        score = parse_score_msmarco(stdout, "MRR @10")
        self.assertAlmostEqual(score, 0.2816, delta=0.0001)
        # Note that this is the score with (k1=2.18, b=0.86); score is 0.2723 with default (k1=0.9, b=0.4) parameters.

    def test_Ma_etal_section4_1b(self):
        """Sample code in Section 4.1. in Ma et al. resource paper."""

        output_file = 'run.msmarco-v2-passage.unicoil.txt'
        self.temp_files.append(output_file)
        run_cmd = f'python -m pyserini.search.lucene \
            --index msmarco-v2-passage-unicoil-0shot \
            --topics msmarco-v2-passage-dev \
            --encoder castorini/unicoil-msmarco-passage \
            --output {output_file} \
            --batch 144 --threads 36 \
            --hits 1000 \
            --impact'
        status = os.system(run_cmd)
        self.assertEqual(status, 0)

        eval_cmd = f'python -m pyserini.eval.trec_eval -c -M 100 -m map -m recip_rank msmarco-v2-passage-dev {output_file}'
        stdout, stderr = run_command(eval_cmd)
        score = parse_score(stdout, "recip_rank")
        self.assertAlmostEqual(score, 0.1501, delta=0.0001)
        # This is the score with otf; with pre-encoded, the score is 0.1499.

    def test_Trotman_etal(self):
        """Sample code in Trotman et al. demo paper."""

        output_file = 'run.msmarco-passage.unicoil.tsv'
        self.temp_files.append(output_file)
        run_cmd = f'python -m pyserini.search.lucene \
            --index msmarco-passage-unicoil-d2q \
            --topics msmarco-passage-dev-subset-unicoil \
            --output {output_file} \
            --output-format msmarco \
            --batch 36 --threads 12 \
            --hits 1000 \
            --impact'
        status = os.system(run_cmd)
        self.assertEqual(status, 0)

        eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        stdout, stderr = run_command(eval_cmd)
        score = parse_score_msmarco(stdout, "MRR @10", digits=3)
        self.assertAlmostEqual(score, 0.352, delta=0.0005)

    # TODO: There's corresponding test code with JASS that's also in the demo paper. We should also add that.

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/run_lucenesearcher.py
ADDED
@@ -0,0 +1,51 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import hashlib
import os
from typing import List


class RunLuceneSearcher:
    def __init__(self, index: str, topics: str):
        self.index_path = index
        self.topics = topics
        self.pyserini_base_cmd = 'python -m pyserini.search.lucene'

    @staticmethod
    def _cleanup(files: List[str]):
        for file in files:
            if os.path.exists(file):
                os.remove(file)

    def run(self, runtag: str, extras: str) -> str:
        print('-------------------------')
        print(f'Running {runtag}:')
        print('-------------------------')

        output = f'verify.pyserini.{runtag}.txt'
        pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \
            + f'--topics {self.topics} --output {output} {extras}'

        status = os.system(pyserini_cmd)
        if not status == 0:
            self._cleanup([output])
            return ""

        with open(output, 'rb') as f:
            md5 = hashlib.md5(f.read()).hexdigest()
        self._cleanup([output])
        return md5
integrations/sparse/test_lucenesearcher_check_core17.py
ADDED
@@ -0,0 +1,59 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import unittest

from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        # The current directory depends on if you're running inside an IDE or from command line.
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            anserini_root = '../../../anserini'
            pyserini_root = '../..'
        else:
            anserini_root = '../anserini'
            pyserini_root = '.'

        self.checker = LuceneSearcherAnseriniMatchChecker(
            anserini_root=anserini_root,
            index=os.path.join(anserini_root, 'indexes/lucene-index.nyt'),
            topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'),
            pyserini_topics='core17',
            qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt'),
            eval_root=pyserini_root)

    def test_bm25(self):
        self.assertTrue(self.checker.run('core17_bm25', '-bm25', '--bm25'))

    def test_bm25_rm3(self):
        self.assertTrue(self.checker.run('core17_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))

    def test_qld(self):
        self.assertTrue(self.checker.run('core17_qld', '-qld', '--qld'))

    def test_qld_rm3(self):
        self.assertTrue(self.checker.run('core17_qld_rm3', '-qld -rm3', '--qld --rm3'))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_core18.py
ADDED
@@ -0,0 +1,59 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import unittest

from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        # The current directory depends on if you're running inside an IDE or from command line.
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            anserini_root = '../../../anserini'
            pyserini_root = '../..'
        else:
            anserini_root = '../anserini'
            pyserini_root = '.'

        self.checker = LuceneSearcherAnseriniMatchChecker(
            anserini_root=anserini_root,
            index=os.path.join(anserini_root, 'indexes/lucene-index.wapo.v2'),
            topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'),
            pyserini_topics='core18',
            qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'),
            eval_root=pyserini_root)

    def test_bm25(self):
        self.assertTrue(self.checker.run('core18_bm25', '-bm25', '--bm25'))

    def test_bm25_rm3(self):
        self.assertTrue(self.checker.run('core18_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))

    def test_qld(self):
        self.assertTrue(self.checker.run('core18_qld', '-qld', '--qld'))

    def test_qld_rm3(self):
        self.assertTrue(self.checker.run('core18_qld_rm3', '-qld -rm3', '--qld --rm3'))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_irst.py
ADDED
@@ -0,0 +1,355 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import unittest
from shutil import rmtree
from random import randint

from integrations.utils import run_command, parse_score


class TestMsmarcoPassageIrst(unittest.TestCase):
    def setUp(self):
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'
        self.tmp = f'tmp{randint(0, 10000)}'
        if os.path.isdir(self.tmp):
            rmtree(self.tmp)
        os.mkdir(self.tmp)
        self.dl19_pass = 'dl19-passage'
        self.dl20 = 'dl20'

    def test_sum_aggregation_dl19_passage(self):
        # dl19 passage sum
        topic = 'dl19-passage'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_pass} \
            --index msmarco-v1-passage \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --alpha 0.1 ')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3281)
        self.assertEqual(ndcg_score, 0.5260)

    def test_sum_aggregation_dl20_passage(self):
        # dl20 passage sum
        topic = 'dl20-passage'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-passage \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --alpha 0.1 ')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3520)
        self.assertEqual(ndcg_score, 0.5578)

    def test_max_aggregation_dl19(self):
        # dl19 passage max
        topic = 'dl19-passage'

        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_pass} \
            --index msmarco-v1-passage \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --max-sim ')
        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3286)
        self.assertEqual(ndcg_score, 0.5371)


    def test_max_aggregation_dl20_passage(self):
        # dl20 passage max
        topic = 'dl20-passage'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-passage \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --max-sim')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3357)
        self.assertEqual(ndcg_score, 0.5469)

    def tearDown(self):
        rmtree(self.tmp)


class TestMsmarcoDocumentIrst(unittest.TestCase):
    def setUp(self):
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'
        self.tmp = f'tmp{randint(0, 10000)}'
        if os.path.isdir(self.tmp):
            rmtree(self.tmp)
        os.mkdir(self.tmp)
        self.dl19_doc = 'dl19-doc'
        self.dl20 = 'dl20'

    def test_sum_aggregation_dl19_doc(self):
        # dl19-doc-sum
        topic = 'dl19-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_doc} \
            --index msmarco-v1-doc \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --alpha 0.3')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.2524)
        self.assertEqual(ndcg_score, 0.5494)

    def test_sum_aggregation_dl20_doc(self):
        # dl20-doc-sum
        topic = 'dl20-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-doc \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --alpha 0.3 ')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3825)
        self.assertEqual(ndcg_score, 0.5559)

    def test_max_aggregation_dl19_doc(self):
        # dl19-doc-max
        topic = 'dl19-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_doc} \
            --index msmarco-v1-doc \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --max-sim')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.2205)
        self.assertEqual(ndcg_score, 0.4917)

    def test_max_aggregation_dl20_doc(self):
        # dl20-doc-max
        topic = 'dl20-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-doc \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --max-sim')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3373)
        self.assertEqual(ndcg_score, 0.5015)

    def tearDown(self):
        rmtree(self.tmp)


class TestMsmarcoDocumentSegIrst(unittest.TestCase):
    def setUp(self):
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            self.pyserini_root = '../..'
        else:
            self.pyserini_root = '.'
        self.tmp = f'tmp{randint(0, 10000)}'
        if os.path.isdir(self.tmp):
            rmtree(self.tmp)
        os.mkdir(self.tmp)
        self.dl19_doc = 'dl19-doc'
        self.dl20 = 'dl20'

    def test_sum_aggregation_dl19_doc_seg(self):
        # dl19-doc-seg-sum
        topic = 'dl19-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_doc} \
            --index msmarco-v1-doc-segmented \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --hits 10000 --segments \
            --alpha 0.3')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.2711)
        self.assertEqual(ndcg_score, 0.5596)

    def test_sum_aggregation_dl20_doc_seg(self):
        # dl20-doc-seg-sum
        topic = 'dl20-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-doc-segmented \
            --output {self.tmp}/regression_test_sum.{topic}.txt \
            --hits 10000 --segments \
            --alpha 0.3 ')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3759)
        self.assertEqual(ndcg_score, 0.5343)

    def test_max_aggregation_dl19_doc_seg(self):
        # dl19-doc-seg-max
        topic = 'dl19-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl19_doc} \
            --index msmarco-v1-doc-segmented \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --hits 10000 --segments \
            --max-sim')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.2425)
        self.assertEqual(ndcg_score, 0.5193)

    def test_max_aggregation_dl20_doc_seg(self):
        # dl20-doc-seg-max
        topic = 'dl20-doc'
        os.system(f'python -m pyserini.search.lucene.irst \
            --topics {self.dl20} \
            --index msmarco-v1-doc-segmented \
            --output {self.tmp}/regression_test_max.{topic}.txt \
            --alpha 0.3 \
            --hits 10000 --segments \
            --max-sim')

        score_cmd = f'python -m pyserini.eval.trec_eval \
            -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'

        status = os.system(score_cmd)
        stdout, stderr = run_command(score_cmd)
        map_score = parse_score(stdout, "map")
        ndcg_score = parse_score(stdout, "ndcg")

        self.assertEqual(status, 0)
        self.assertEqual(stderr, '')
        self.assertEqual(map_score, 0.3496)
        self.assertEqual(ndcg_score, 0.5089)

    def tearDown(self):
        rmtree(self.tmp)


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py
ADDED
@@ -0,0 +1,60 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import subprocess
import sys
import unittest
from shutil import rmtree


class TestLtrMsmarcoDocument(unittest.TestCase):
    def test_reranking(self):
        if os.path.isdir('ltr_test'):
            rmtree('ltr_test')
        os.mkdir('ltr_test')
        inp = 'run.msmarco-pass-doc.bm25.txt'
        outp = 'run.ltr.msmarco-pass-doc.test.trec'
        outp_tsv = 'run.ltr.msmarco-pass-doc.test.tsv'
        # Pre-trained ltr model
        model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-msmarco-passage-mrr-v1.tar.gz'
        model_tar_name = 'model-ltr-msmarco-passage-mrr-v1.tar.gz'
        os.system(f'wget {model_url} -P ltr_test/')
        os.system(f'tar -xzvf ltr_test/{model_tar_name} -C ltr_test')

        # IBM model
        ibm_model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-ibm.tar.gz'
        ibm_model_tar_name = 'model-ltr-ibm.tar.gz'
        os.system(f'wget {ibm_model_url} -P ltr_test/')
        os.system(f'tar -xzvf ltr_test/{ibm_model_tar_name} -C ltr_test')
        os.system(f'python -m pyserini.search.lucene.ltr \
            --topic tools/topics-and-qrels/topics.msmarco-doc.dev.txt \
            --model ltr_test/msmarco-passage-ltr-mrr-v1/ \
            --qrel tools/topics-and-qrels/qrels.msmarco-doc.dev.txt \
            --index msmarco-doc-per-passage-ltr --ibm-model ltr_test/ibm_model/ \
            --granularity document --output ltr_test/{outp} --max-passage --hits 10000')

        result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_doc_eval.py --judgments tools/topics-and-qrels/qrels.msmarco-doc.dev.txt --run ltr_test/{outp}', shell=True).decode(sys.stdout.encoding)
        a, b = result.find('#####################\nMRR @100:'), result.find('\nQueriesRanked: 5193\n#####################\n')
        mrr = result[a+32:b]
        # See:
        # - https://github.com/castorini/pyserini/issues/951
        # - https://github.com/castorini/pyserini/issues/1430
        self.assertAlmostEqual(float(mrr), 0.3108, delta=0.0002)
        rmtree('ltr_test')

if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py
ADDED
@@ -0,0 +1,59 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import subprocess
import sys
import unittest
from shutil import rmtree

from pyserini.search.lucene import LuceneSearcher


class TestLtrMsmarcoPassage(unittest.TestCase):
    def test_reranking(self):
        if os.path.isdir('ltr_test'):
            rmtree('ltr_test')
        os.mkdir('ltr_test')
        inp = 'run.msmarco-passage.bm25tuned.txt'
        outp = 'run.ltr.msmarco-passage.test.tsv'
        # Pre-trained ltr model
        model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-msmarco-passage-mrr-v1.tar.gz'
        model_tar_name = 'model-ltr-msmarco-passage-mrr-v1.tar.gz'
        os.system(f'wget {model_url} -P ltr_test/')
        os.system(f'tar -xzvf ltr_test/{model_tar_name} -C ltr_test')
        # IBM model
        ibm_model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-ibm.tar.gz'
        ibm_model_tar_name = 'model-ltr-ibm.tar.gz'
        os.system(f'wget {ibm_model_url} -P ltr_test/')
        os.system(f'tar -xzvf ltr_test/{ibm_model_tar_name} -C ltr_test')
        # queries process
        os.system(f'python -m pyserini.search.lucene.ltr \
            --model ltr_test/msmarco-passage-ltr-mrr-v1 \
            --topic tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
            --qrel tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
            --index msmarco-passage-ltr --ibm-model ltr_test/ibm_model/ \
            --output-format tsv --output ltr_test/{outp}')
        result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_passage_eval.py tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ltr_test/{outp}', shell=True).decode(sys.stdout.encoding)
        a, b = result.find('#####################\nMRR @10:'), result.find('\nQueriesRanked: 6980\n#####################\n')
        mrr = result[a+31:b]
        # See https://github.com/castorini/pyserini/issues/951
        self.assertAlmostEqual(float(mrr), 0.2472, delta=0.0001)
        rmtree('ltr_test')


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_robust04.py
ADDED
@@ -0,0 +1,59 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import unittest

from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        # The current directory depends on if you're running inside an IDE or from command line.
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            anserini_root = '../../../anserini'
            pyserini_root = '../..'
        else:
            anserini_root = '../anserini'
            pyserini_root = '.'

        self.checker = LuceneSearcherAnseriniMatchChecker(
            anserini_root=anserini_root,
            index=os.path.join(anserini_root, 'indexes/lucene-index.disk45'),
            topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'),
            pyserini_topics='robust04',
            qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'),
            eval_root=pyserini_root)

    def test_bm25(self):
        self.assertTrue(self.checker.run('robust04_bm25', '-bm25', '--bm25'))

    def test_bm25_rm3(self):
        self.assertTrue(self.checker.run('robust04_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))

    def test_qld(self):
        self.assertTrue(self.checker.run('robust04_qld', '-qld', '--qld'))

    def test_qld_rm3(self):
        self.assertTrue(self.checker.run('robust04_qld_rm3', '-qld -rm3', '--qld --rm3'))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_check_robust05.py
ADDED
@@ -0,0 +1,59 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import unittest

from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        # The current directory depends on if you're running inside an IDE or from command line.
        curdir = os.getcwd()
        if curdir.endswith('sparse'):
            anserini_root = '../../../anserini'
            pyserini_root = '../..'
        else:
            anserini_root = '../anserini'
            pyserini_root = '.'

        self.checker = LuceneSearcherAnseriniMatchChecker(
            anserini_root=anserini_root,
            index=os.path.join(anserini_root, 'indexes/lucene-index.robust05'),
            topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'),
            pyserini_topics='robust05',
            qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'),
            eval_root=pyserini_root)

    def test_bm25(self):
        self.assertTrue(self.checker.run('robust05_bm25', '-bm25', '--bm25'))

    def test_bm25_rm3(self):
        self.assertTrue(self.checker.run('robust05_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))

    def test_qld(self):
        self.assertTrue(self.checker.run('robust05_qld', '-qld', '--qld'))

    def test_qld_rm3(self):
        self.assertTrue(self.checker.run('robust05_qld_rm3', '-qld -rm3', '--qld --rm3'))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_lucenesearcher_multithread.py
ADDED
@@ -0,0 +1,79 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import unittest

from integrations.run_lucenesearcher import RunLuceneSearcher


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.test_threads = ['--threads 1 --batch-size 64', '--threads 4 --batch-size 64']

    def check_equal(self, runner: RunLuceneSearcher, runtag: str, extras: str) -> bool:
        checksums = []
        for i, config in enumerate(self.test_threads):
            checksum = runner.run(runtag=f'{runtag}-{i}', extras=f'{config} {extras}')
            if len(checksum) == 0:
                print(f'[FAIL] {runtag} {config} failed to run!')
                return False
            checksums.append(checksum)
        equal = all(x == checksums[0] for x in checksums)
        if equal:
            print(f'[SUCCESS] {runtag} results match!')
        else:
            print(f'[FAIL] {runtag} results do not match!')
        return equal

    def test_robust04(self):
        checker = RunLuceneSearcher(index='robust04', topics='robust04')
        self.assertTrue(self.check_equal(checker, 'robust04', extras=''))

    def test_msmarco_passage(self):
        checker = RunLuceneSearcher(index='msmarco-v1-passage', topics='msmarco-passage-dev-subset')
        self.assertTrue(self.check_equal(checker, 'msmarco-v1-passage', extras='--output-format msmarco'))

    def test_msmarco_passage_docTTTTTquery(self):
        checker = RunLuceneSearcher(index='msmarco-v1-passage-d2q-t5', topics='msmarco-passage-dev-subset')
        self.assertTrue(self.check_equal(checker, 'msmarco-v1-passage-d2q-t5', extras='--output-format msmarco'))

    def test_msmarco_doc(self):
        checker = RunLuceneSearcher(index='msmarco-v1-doc', topics='msmarco-doc-dev')
        self.assertTrue(self.check_equal(checker, 'msmarco-v1-doc', extras='--hits 100 --output-format msmarco'))

    def test_msmarco_doc_docTTTTTquery(self):
        checker = RunLuceneSearcher(index='msmarco-v1-doc-d2q-t5', topics='msmarco-doc-dev')
        self.assertTrue(self.check_equal(checker, 'msmarco-v1-doc-d2q-t5',
                                         extras='--hits 100 --output-format msmarco'))

    def test_msmarco_doc_per_passage(self):
        checker = RunLuceneSearcher(index='msmarco-v1-doc-segmented', topics='msmarco-doc-dev')
        self.assertTrue(
            self.check_equal(checker, 'msmarco-v1-doc-segmented',
                             extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))

    def test_msmarco_doc_docTTTTTquery_passage(self):
        checker = RunLuceneSearcher(index='msmarco-v1-doc-segmented-d2q-t5', topics='msmarco-doc-dev')
        self.assertTrue(
            self.check_equal(checker, 'msmarco-v1-doc-segmented-d2q-t5',
                             extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))

    def tearDown(self):
        pass


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_nmslib.py
ADDED
@@ -0,0 +1,66 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for nmslib index search"""

import os
import socket
import unittest

from integrations.utils import clean_files, run_command, parse_score


class TestSearchIntegration(unittest.TestCase):
    def setUp(self):
        self.temp_files = []
        self.threads = 12
        self.batch_size = 36

        # Hard-code larger values for internal servers
        if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
            self.threads = 36
            self.batch_size = 144

    def test_msmarco_passage_deepimpact_nmslib_hnsw(self):
        output_file = 'test_run.msmarco-passage.deepimpact.nmslib.tsv'
        self.temp_files.append(output_file)
        cmd = 'wget https://raw.githubusercontent.com/castorini/pyserini-data/main/encoded-queries/deepimpact_msmarco_passage_dev_topic.jsonl'
        status = os.system(cmd)
        cmd = 'wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz'
        status = os.system(cmd)
        cmd = 'tar -xvf nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz'
        status = os.system(cmd)
        self.temp_files.append('deepimpact_msmarco_passage_dev_topic.jsonl')
        self.temp_files.append('nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz')
        self.temp_files.append('nmslib-index.msmarco-passage.deepimpact.20211012.58d286')
        cmd1 = f'python -m pyserini.search.nmslib --topics deepimpact_msmarco_passage_dev_topic.jsonl \
            --index nmslib-index.msmarco-passage.deepimpact.20211012.58d286 \
            --output {output_file} \
            --hits 1000 --batch {self.batch_size} --threads {self.threads} \
            --output-format msmarco --is-sparse --ef 1000'
        cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
        status = os.system(cmd1)
        stdout, stderr = run_command(cmd2)
        score = parse_score(stdout, "MRR @10")
        self.assertEqual(status, 0)
        self.assertAlmostEqual(score, 0.298, delta=0.001)

    def tearDown(self):
        clean_files(self.temp_files)


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_prebuilt_msmarco_v1_doc.py
ADDED
@@ -0,0 +1,170 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for MS MARCO V1 doc corpora (full and segmented) using pre-built indexes."""

import unittest

from integrations.utils import run_retrieval_and_return_scores


class TestPrebuiltMsMarcoV1Doc(unittest.TestCase):
    def setUp(self):
        self.threads = 16
        self.batch_size = 128

    #
    # doc "full" conditions
    #

    def test_doc_full_trec_output(self):
        """Test case for MS MARCO V1 doc (full), dev queries, TREC output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-doc', 'msmarco-v1-doc-slim', 'msmarco-v1-doc-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-doc.trec.txt',
                f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-doc-dev --bm25 --hits 1000',
                'msmarco-doc-dev',
                'trec_eval',
                [['map', 'map'], ['recall.1000', 'recall_1000']])

            self.assertTrue('map' in scores)
            self.assertTrue('recall.1000' in scores)
            self.assertAlmostEqual(scores['map'], 0.2774, delta=0.0001)
            self.assertAlmostEqual(scores['recall.1000'], 0.9357, delta=0.0001)

    def test_doc_full_msmarco_output(self):
        """Test case for MS MARCO V1 doc (full), dev queries, MS MARCO output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-doc', 'msmarco-v1-doc-slim', 'msmarco-v1-doc-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-doc.msmarco.txt',
                f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-doc-dev --bm25 --hits 100 --output-format msmarco',
                'msmarco-doc-dev',
                'msmarco_doc_string', [])

            self.assertTrue('MRR@100' in scores)
            self.assertEqual(scores['MRR@100'], '0.2766351807440808')

    #
    # doc segmented conditions
    #

    def test_doc_segmented_trec_output(self):
        """Test case for MS MARCO V1 doc segmented, dev queries, TREC output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-doc-segmented', 'msmarco-v1-doc-segmented-slim', 'msmarco-v1-doc-segmented-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-doc-segmented.trec.txt',
                f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-doc-dev --bm25 --hits 10000 --max-passage --max-passage-hits 1000',
                'msmarco-doc-dev',
                'trec_eval',
                [['map', 'map'], ['recall.1000', 'recall_1000']])

            self.assertTrue('map' in scores)
            self.assertTrue('recall.1000' in scores)
            self.assertAlmostEqual(scores['map'], 0.2762, delta=0.0001)
            self.assertAlmostEqual(scores['recall.1000'], 0.9311, delta=0.0001)

    def test_doc_segmented_msmarco_output(self):
        """Test case for MS MARCO V1 doc segmented, dev queries, MS MARCO output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-doc-segmented', 'msmarco-v1-doc-segmented-slim', 'msmarco-v1-doc-segmented-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-doc-segmented.msmarco.txt',
                f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-doc-dev \
                    --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco',
                'msmarco-doc-dev',
                'msmarco_doc_string', [])

            self.assertTrue('MRR@100' in scores)
            self.assertEqual(scores['MRR@100'], '0.2755196341768384')

    #
    # doc2query conditions
    #

    def test_doc_full_expanded_trec_output(self):
        """Test case for MS MARCO V1 doc (full) + doc2query-T5 expansions, dev queries, TREC output."""
        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-doc.expanded.trec.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-doc-d2q-t5 --topics msmarco-doc-dev --bm25 --hits 1000',
            'msmarco-doc-dev',
            'trec_eval',
            [['map', 'map'], ['recall.1000', 'recall_1000']])

        self.assertTrue('map' in scores)
        self.assertTrue('recall.1000' in scores)
        self.assertAlmostEqual(scores['map'], 0.3273, delta=0.0001)
        self.assertAlmostEqual(scores['recall.1000'], 0.9553, delta=0.0001)

    def test_doc_full_expanded_msmarco_output(self):
        """Test case for MS MARCO V1 doc (full) + doc2query-T5 expansions, dev queries, MS MARCO output."""
        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-doc.expanded.msmarco.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-doc-d2q-t5 --topics msmarco-doc-dev --bm25 --hits 100 --output-format msmarco',
            'msmarco-doc-dev',
            'msmarco_doc_string', [])

        self.assertTrue('MRR@100' in scores)
        self.assertEqual(scores['MRR@100'], '0.3268656233100833')

    def test_doc_segmented_expanded_trec_output(self):
        """Test case for MS MARCO V1 doc segmented + doc2query-T5 expansions, dev queries, TREC output."""
        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-doc-segmented.expanded.trec.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-doc-segmented-d2q-t5 --topics msmarco-doc-dev \
                --bm25 --hits 10000 --max-passage --max-passage-hits 1000',
            'msmarco-doc-dev',
            'trec_eval',
            [['map', 'map'], ['recall.1000', 'recall_1000']])

        self.assertTrue('map' in scores)
        self.assertTrue('recall.1000' in scores)
        self.assertAlmostEqual(scores['map'], 0.3213, delta=0.0001)
        self.assertAlmostEqual(scores['recall.1000'], 0.9530, delta=0.0001)

    def test_doc_segmented_expanded_msmarco_output(self):
        """Test case for MS MARCO V1 doc segmented + doc2query-T5 expansions, dev queries, MS MARCO output."""
        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-doc-segmented.expanded.msmarco.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-doc-segmented-d2q-t5 --topics msmarco-doc-dev \
                --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco',
            'msmarco-doc-dev',
            'msmarco_doc_string', [])

        self.assertTrue('MRR@100' in scores)
        self.assertEqual(scores['MRR@100'], '0.320918438140918')


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_prebuilt_msmarco_v1_passage.py
ADDED
@@ -0,0 +1,96 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for MS MARCO V1 passage corpus using pre-built indexes."""

import unittest

from integrations.utils import run_retrieval_and_return_scores


class TestPrebuiltMsMarcoV1Passage(unittest.TestCase):
    def setUp(self):
        self.threads = 16
        self.batch_size = 128

    def test_passage_trec_output(self):
        """Test case for MS MARCO V1 passage, dev queries, TREC output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-passage', 'msmarco-v1-passage-slim', 'msmarco-v1-passage-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-passage.trec.txt',
                f'python -m pyserini.search.lucene \
                    --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-passage-dev-subset --bm25',
                'msmarco-passage-dev-subset',
                'trec_eval',
                [['map', 'map'], ['recall.1000', 'recall_1000']])

            self.assertTrue('map' in scores)
            self.assertTrue('recall.1000' in scores)
            self.assertAlmostEqual(scores['map'], 0.1958, delta=0.0001)
            self.assertAlmostEqual(scores['recall.1000'], 0.8573, delta=0.0001)

    def test_passage_msmarco_output(self):
        """Test case for MS MARCO V1 passage, dev queries, MS MARCO output
        on all three pre-built indexes (base, slim, full)."""

        # Loop over all three pre-built indexes.
        for index in ['msmarco-v1-passage', 'msmarco-v1-passage-slim', 'msmarco-v1-passage-full']:
            scores = run_retrieval_and_return_scores(
                'runs/test_run.msmarco-passage.msmarco.txt',
                f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                    --index {index} --topics msmarco-passage-dev-subset --bm25 --output-format msmarco',
                'msmarco-passage-dev-subset',
                'msmarco_passage_string', [])

            self.assertTrue('MRR@10' in scores)
            self.assertEqual(scores['MRR@10'], '0.18741227770955546')

    def test_passage_expanded_trec_output(self):
        """Test case for MS MARCO V1 passage w/ doc2query-T5 expansions, dev queries, TREC output."""

        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-passage.expanded.trec.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-passage-d2q-t5 --topics msmarco-passage-dev-subset --bm25',
            'msmarco-passage-dev-subset',
            'trec_eval',
            [['map', 'map'], ['recall.1000', 'recall_1000']])

        self.assertTrue('map' in scores)
        self.assertTrue('recall.1000' in scores)
        self.assertAlmostEqual(scores['map'], 0.2893, delta=0.0001)
        self.assertAlmostEqual(scores['recall.1000'], 0.9506, delta=0.0001)

    def test_passage_expanded_msmarco_output(self):
        """Test case for MS MARCO V1 passage w/ doc2query-T5 expansions, dev queries, MS MARCO output."""

        scores = run_retrieval_and_return_scores(
            'runs/test_run.msmarco-passage.expanded.msmarco.txt',
            f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
                --index msmarco-v1-passage-d2q-t5 --topics msmarco-passage-dev-subset --bm25 --output-format msmarco',
            'msmarco-passage-dev-subset',
            'msmarco_passage_string', [])

        self.assertTrue('MRR@10' in scores)
        self.assertEqual(scores['MRR@10'], '0.281560751807885')


if __name__ == '__main__':
    unittest.main()
integrations/sparse/test_prebuilt_robust04.py
ADDED
@@ -0,0 +1,42 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for Robust04 using pre-built indexes."""

import unittest

from integrations.utils import run_retrieval_and_return_scores


class TestPrebuiltRobust04(unittest.TestCase):
    def test_robust04(self):
        """Test case for Robust04."""

        scores = run_retrieval_and_return_scores(
            'runs/test_run.robust04.bm25.txt',
            'python -m pyserini.search.lucene --topics robust04 --index robust04 --bm25',
            'robust04',
            'trec_eval',
            [['map', 'map'], ['P.30', 'P_30']])

        self.assertTrue('map' in scores)
        self.assertTrue('P.30' in scores)
        self.assertAlmostEqual(scores['map'], 0.2531, delta=0.0001)
        self.assertAlmostEqual(scores['P.30'], 0.3102, delta=0.0001)


if __name__ == '__main__':
    unittest.main()