Spaces:
Runtime error
Runtime error
Commit
•
5d0ccb5
1
Parent(s):
ff76c3e
Add a 'url' column (built from the permalink) and prefix extracted subreddit names with 'r/'
Browse files
- src/build_nomic.py +3 -2
src/build_nomic.py
CHANGED
@@ -44,7 +44,7 @@ def delete_old_nomic():
|
|
44 |
def build_nomic(dataset):
|
45 |
df = dataset['train'].to_pandas()
|
46 |
|
47 |
-
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', '
|
48 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
49 |
|
50 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
@@ -62,6 +62,7 @@ def build_nomic(dataset):
|
|
62 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
63 |
|
64 |
df['word_count'] = df['content'].apply(count_words)
|
|
|
65 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
66 |
|
67 |
# Regex to extract subreddit
|
@@ -69,7 +70,7 @@ def build_nomic(dataset):
|
|
69 |
def extract_subreddit(text):
|
70 |
match = subreddit_re.search(text)
|
71 |
if match:
|
72 |
-
return match.group(1)
|
73 |
return ''
|
74 |
|
75 |
# Apply the function
|
|
|
44 |
def build_nomic(dataset):
|
45 |
df = dataset['train'].to_pandas()
|
46 |
|
47 |
+
non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
|
48 |
'score', 'score_percentile', 'html_content', 'subreddit']
|
49 |
|
50 |
# Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
|
|
|
62 |
df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
|
63 |
|
64 |
df['word_count'] = df['content'].apply(count_words)
|
65 |
+
df['url'] = 'https://www.reddit.com' + df['permalink']
|
66 |
df['html_content'] = df['content'].apply(convert_markdown_to_html)
|
67 |
|
68 |
# Regex to extract subreddit
|
|
|
70 |
def extract_subreddit(text):
|
71 |
match = subreddit_re.search(text)
|
72 |
if match:
|
73 |
+
return 'r/' + match.group(1)
|
74 |
return ''
|
75 |
|
76 |
# Apply the function
|