anpigon committed on
Commit
3e40865
1 Parent(s): 2b11770

chore: Add evaluation data files

Browse files
evaluation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_data/dataset/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a65e6abf048326f22dfaf0bfb8b030eb9432366d427abb37f87e002c320da61b
3
+ size 503992
evaluation_data/dataset/dataset_info.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "question": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "ground_truth": {
10
+ "feature": {
11
+ "dtype": "string",
12
+ "_type": "Value"
13
+ },
14
+ "_type": "Sequence"
15
+ },
16
+ "answer": {
17
+ "dtype": "string",
18
+ "_type": "Value"
19
+ },
20
+ "contexts": {
21
+ "feature": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "_type": "Sequence"
26
+ }
27
+ },
28
+ "homepage": "",
29
+ "license": ""
30
+ }
evaluation_data/dataset/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "0b75232bae9cb3e7",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
make_corpus.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Build the evaluation corpus: load pickled batches, wrap each sentence in a
# LangChain Document, and write the documents to a parquet file.
import pickle
from langchain.schema import Document
from autorag.data.corpus import langchain_documents_to_parquet

# Location of the pickled source data (machine-specific absolute path).
CASES_PICKLE_PATH = "/Users/anpigon/Documents/Embed/법원판례/cases.pkl"
# Destination of the generated corpus.
# NOTE(review): other evaluation artifacts are stored under "evaluation_data/";
# confirm that "evaluation/data/" is the intended output directory.
CORPUS_PARQUET_PATH = "evaluation/data/corpus.parquet"
# Only the first MAX_BATCHES entries of the pickle are converted.
MAX_BATCHES = 100

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only run this script against a pickle that you produced yourself.
with open(CASES_PICKLE_PATH, "rb") as file:
    data = pickle.load(file)

print(len(data))  # total number of batches (2736 when this was written)

docs = []
# Clamp the bound to len(data) so an input with fewer than MAX_BATCHES entries
# is processed in full instead of raising IndexError.
for i in range(min(MAX_BATCHES, len(data))):
    # Element 1 of each batch is iterated as the batch's sentences.
    for sentence in data[i][1]:
        print(sentence)
        docs.append(Document(page_content=sentence))

langchain_documents_to_parquet(docs, CORPUS_PARQUET_PATH)