derek-thomas HF staff committed on
Commit
5d0ccb5
1 Parent(s): ff76c3e

Adding url and making subreddit have r/

Browse files
Files changed (1) hide show
  1. src/build_nomic.py +3 -2
src/build_nomic.py CHANGED
@@ -44,7 +44,7 @@ def delete_old_nomic():
44
  def build_nomic(dataset):
45
  df = dataset['train'].to_pandas()
46
 
47
- non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
48
  'score', 'score_percentile', 'html_content', 'subreddit']
49
 
50
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
@@ -62,6 +62,7 @@ def build_nomic(dataset):
62
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
63
 
64
  df['word_count'] = df['content'].apply(count_words)
 
65
  df['html_content'] = df['content'].apply(convert_markdown_to_html)
66
 
67
  # Regex to extract subreddit
@@ -69,7 +70,7 @@ def build_nomic(dataset):
69
  def extract_subreddit(text):
70
  match = subreddit_re.search(text)
71
  if match:
72
- return match.group(1)
73
  return ''
74
 
75
  # Apply the function
 
44
  def build_nomic(dataset):
45
  df = dataset['train'].to_pandas()
46
 
47
+ non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'url', 'id', 'word_count',
48
  'score', 'score_percentile', 'html_content', 'subreddit']
49
 
50
  # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
 
62
  df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
63
 
64
  df['word_count'] = df['content'].apply(count_words)
65
+ df['url'] = 'https://www.reddit.com' + df['permalink']
66
  df['html_content'] = df['content'].apply(convert_markdown_to_html)
67
 
68
  # Regex to extract subreddit
 
70
  def extract_subreddit(text):
71
  match = subreddit_re.search(text)
72
  if match:
73
+ return 'r/' + match.group(1)
74
  return ''
75
 
76
  # Apply the function