Spaces:

teatwots
/

wordcloud

Sleeping

App Files Files Community

teatwots committed on Jun 10

Commit

ea5d289

•

1 Parent(s): 943fd65

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -72

app.py CHANGED Viewed

@@ -1,81 +1,96 @@
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import nltk
 from collections import Counter
-from translate import Translator
-from nltk.corpus import stopwords
 import gradio as gr
 # Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
-# Set up the translator
-translator = Translator(to_lang="ko")
 stop_words = set(stopwords.words('english'))
-# Define example sentences and synonyms for the word list
 word_data_examples = {
-    "village": ("The village was quiet at night.", "hamlet, community"),
-    "adventure": ("They went on an exciting adventure in the forest.", "expedition, quest"),
-    "map": ("We used a map to find the hidden treasure.", "chart, atlas"),
-    "cave": ("They explored a dark cave in the mountains.", "cavern, grotto"),
-    "among": ("She found her book among the pile of papers.", "amidst, between"),
-    "mountains": ("The mountains were covered with snow in winter.", "peaks, ranges"),
-    "children": ("The children played games in the park.", "kids, youngsters"),
-    "known": ("He was known for his kindness and bravery.", "recognized, famous"),
-    "hidden": ("They found a hidden door behind the bookshelf.", "concealed, secret"),
-    "local": ("The local market was full of fresh produce.", "regional, native"),
-    "discovery": ("The discovery of the old map excited everyone.", "finding, revelation"),
-    "eagle": ("An eagle soared high above the valley.", "raptor, bird of prey"),
-    "villagers": ("The villagers gathered in the square for the festival.", "residents, townsfolk"),
-    "legend": ("The legend of the lost city intrigued the adventurers.", "myth, lore"),
-    "tales": ("Grandma told us tales of her childhood.", "stories, narratives"),
-    "daring": ("His daring escape from the cave was legendary.", "bold, audacious"),
-    "spirit": ("The spirit of adventure was alive in their hearts.", "soul, essence"),
-    "exploring": ("They spent the summer exploring the forest.", "investigating, discovering"),
-    "old": ("The old castle was full of secrets.", "ancient, aged"),
-    "lost": ("He felt lost without his best friend.", "missing, misplaced"),
-    "ancient": ("They discovered ancient artifacts in the desert.", "archaic, antique"),
-    "inside": ("Inside the box was a beautiful necklace.", "within, interior"),
-    "treasure": ("They dreamed of finding hidden treasure.", "riches, valuables"),
-    "whispering": ("The trees were whispering secrets in the wind.", "murmuring, softly speaking"),
-    "hollow": ("They found a hollow tree to hide in during the storm.", "cavity, void"),
-    "decided": ("She decided to take the long way home.", "determined, resolved"),
-    "journey": ("Their journey took them across the country.", "trip, voyage"),
-    "together": ("They worked together to solve the mystery.", "jointly, collectively"),
-    "way": ("She found a new way to solve the puzzle.", "method, manner"),
-    "reached": ("They finally reached the top of the hill.", "arrived, attained"),
-    "chest": ("The chest was filled with gold coins.", "trunk, box"),
-    "boulder": ("A large boulder blocked the path.", "rock, stone"),
-    "artifacts": ("The museum displayed artifacts from ancient Egypt.", "relics, antiquities"),
-    "legends": ("The legends spoke of a hidden kingdom.", "myths, sagas"),
-    "explore": ("They wanted to explore the old mansion.", "investigate, examine"),
-    "secret": ("She kept the secret hidden from everyone.", "confidential, hidden"),
-    "small": ("The small kitten was very playful.", "tiny, little"),
-    "mountain": ("The mountain was covered in thick forests.", "peak, hill"),
-    "part": ("Each part of the puzzle was important.", "piece, segment"),
-    "everyday": ("He wore his everyday clothes to the party.", "daily, routine"),
-    "life": ("Life in the village was peaceful.", "existence, being"),
-    "nestled": ("The cabin was nestled in the woods.", "tucked, situated"),
-    "towering": ("The towering trees made the forest dark and cool.", "lofty, soaring"),
-    "peaks": ("The mountain peaks were covered in snow.", "summits, crests"),
-    "said": ("He said he would be back soon.", "stated, remarked"),
-    "protected": ("The ancient ruins were protected by law.", "guarded, sheltered"),
-    "massive": ("The massive ship docked at the port.", "enormous, huge"),
-    "supposedly": ("The treasure was supposedly buried under the tree.", "allegedly, reportedly"),
-    "watched": ("They watched the movie together.", "observed, viewed"),
-    "perch": ("The bird found a perch on the windowsill.", "roost, rest")
 }
 # Words to be excluded from both the word cloud and the word list
 exclude_words = set([
-    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
     'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
     'alex', 'mia', 'sam', 'echo', 'ridge', 'guardian', 'of', 'the', 'glen'
 ])
 def process_text(text):
     words = nltk.word_tokenize(text)
     words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words and word.lower() not in exclude_words]
@@ -93,30 +108,59 @@ def generate_wordcloud(word_freq):
 def translate_and_get_pos(word_freq, pos_tags):
     pos_map = {
-        'NN': 'n.', 'NNS': 'n.', 'VB': 'v.', 'VBD': 'v.', 'VBG': 'v.', 'VBN': 'v.',
-        'VBP': 'v.', 'VBZ': 'v.', 'JJ': 'adj.', 'JJR': 'adj.', 'JJS': 'adj.', 'RB': 'adv.',
-        'RBR': 'adv.', 'RBS': 'adv.'
     }
     word_data = []
     for word, freq in word_freq.items():
-        pos = [pos_tag[1] for pos_tag in pos_tags if pos_tag[0] == word]
-        if pos and (pos[0] in ['NNP', 'NNPS'] or word in exclude_words):
-            continue  # Skip proper nouns, pronouns, and specific excluded words
-        translation = translator.translate(word)
-        pos = pos_map.get(pos[0], 'N/A') if pos else 'N/A'
-        example_sentence, synonyms = word_data_examples.get(word, (f"ex) The word '{word}' in a sentence.", ""))
-        word_data.append((word, freq, translation, pos, example_sentence, synonyms))
-    word_data.sort(key=lambda x: x[1], reverse=True)
-    return word_data[:50]
 def main(text):
     word_freq, pos_tags = process_text(text)
     wordcloud_image = generate_wordcloud(word_freq)
     word_data = translate_and_get_pos(word_freq, pos_tags)
-    word_data_str = "\n".join([f"{i+1}. {word}: {pos} {translation}, ex) {example_sentence} 동의어: {synonyms}." for i, (word, freq, translation, pos, example_sentence, synonyms) in enumerate(word_data)])
-    return wordcloud_image, word_data_str
 # Custom CSS for the Gradio interface
 css = """
@@ -128,6 +172,18 @@ body {
     background-color: blue !important;
     border-color: blue !important;
 }
 </style>
 """
@@ -135,7 +191,7 @@ body {
 interface = gr.Interface(
     fn=main,
     inputs="text",
-    outputs=["image", "text"],
     title="Wordcloud Vocabulary Learning App",
     description="Input text to generate a word cloud and a frequency list with Korean meanings, parts of speech, and example sentences."
      "<br><br><b>The full text:</b><br>"

 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import nltk
 from collections import Counter
+from nltk.corpus import stopwords, wordnet
+from nltk.stem import WordNetLemmatizer
 import gradio as gr
+import pandas as pd
 # Download necessary NLTK data
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
+nltk.download('wordnet')
 stop_words = set(stopwords.words('english'))
+lemmatizer = WordNetLemmatizer()
+# Define example sentences, synonyms, and Korean meanings for the word list
 word_data_examples = {
+    "village": ("The village was quiet at night.", "hamlet, community", "마을", "시골"),
+    "adventure": ("They went on an exciting adventure in the forest.", "expedition, quest", "모험", "여행"),
+    "map": ("We used a map to find the hidden treasure.", "chart, atlas", "지도", "약도"),
+    "cave": ("They explored a dark cave in the mountains.", "cavern, grotto", "동굴", "굴"),
+    "among": ("She found her book among the pile of papers.", "amidst, between", "가운데", "사이에"),
+    "mountains": ("The mountains were covered with snow in winter.", "peaks, ranges", "산", "산맥"),
+    "children": ("The children played games in the park.", "kids, youngsters", "아이들", "어린이"),
+    "known": ("He was known for his kindness and bravery.", "recognized, famous", "알려진", "유명한"),
+    "hidden": ("They found a hidden door behind the bookshelf.", "concealed, secret", "숨겨진", "비밀의"),
+    "local": ("The local market was full of fresh produce.", "regional, native", "지역의", "현지의"),
+    "discovery": ("The discovery of the old map excited everyone.", "finding, revelation", "발견", "탐구"),
+    "eagle": ("An eagle soared high above the valley.", "raptor, bird of prey", "독수리", "맹금"),
+    "villagers": ("The villagers gathered in the square for the festival.", "residents, townsfolk", "마을 사람들", "주민들"),
+    "legend": ("The legend of the lost city intrigued the adventurers.", "myth, lore", "전설", "신화"),
+    "tales": ("Grandma told us tales of her childhood.", "stories, narratives", "이야기", "동화"),
+    "daring": ("His daring escape from the cave was legendary.", "bold, audacious", "대담한", "용감한"),
+    "spirit": ("The spirit of adventure was alive in their hearts.", "soul, essence", "정신", "혼"),
+    "exploring": ("They spent the summer exploring the forest.", "investigating, discovering", "탐험하다", "탐구하다"),
+    "old": ("The old castle was full of secrets.", "ancient, aged", "오래된", "낡은"),
+    "lost": ("He felt lost without his best friend.", "missing, misplaced", "잃어버린", "길을 잃은"),
+    "ancient": ("They discovered ancient artifacts in the desert.", "archaic, antique", "고대의", "옛날의"),
+    "inside": ("Inside the box was a beautiful necklace.", "within, interior", "안쪽", "내부"),
+    "treasure": ("They dreamed of finding hidden treasure.", "riches, valuables", "보물", "귀중품"),
+    "whispering": ("The trees were whispering secrets in the wind.", "murmuring, softly speaking", "속삭이는", "조용히 말하는"),
+    "hollow": ("They found a hollow tree to hide in during the storm.", "cavity, void", "빈", "구멍 난"),
+    "decided": ("She decided to take the long way home.", "determined, resolved", "결정하다", "결심하다"),
+    "journey": ("Their journey took them across the country.", "trip, voyage", "여행", "여정"),
+    "together": ("They worked together to solve the mystery.", "jointly, collectively", "함께", "같이"),
+    "way": ("She found a new way to solve the puzzle.", "method, manner", "방법", "방식"),
+    "reached": ("They finally reached the top of the hill.", "arrived, attained", "도달하다", "도착하다"),
+    "chest": ("The chest was filled with gold coins.", "trunk, box", "상자", "가슴"),
+    "boulder": ("A large boulder blocked the path.", "rock, stone", "바위", "돌"),
+    "artifacts": ("The museum displayed artifacts from ancient Egypt.", "relics, antiquities", "유물", "고대 유물"),
+    "legends": ("The legends spoke of a hidden kingdom.", "myths, sagas", "전설", "신화"),
+    "explore": ("They wanted to explore the old mansion.", "investigate, examine", "탐험하다", "조사하다"),
+    "secret": ("She kept the secret hidden from everyone.", "confidential, hidden", "비밀", "숨겨진"),
+    "small": ("The small kitten was very playful.", "tiny, little", "작은", "소형"),
+    "mountain": ("The mountain was covered in thick forests.", "peak, hill", "산", "산맥"),
+    "part": ("Each part of the puzzle was important.", "piece, segment", "부분", "조각"),
+    "everyday": ("He wore his everyday clothes to the party.", "daily, routine", "일상적인", "매일의"),
+    "life": ("Life in the village was peaceful.", "existence, being", "삶", "생명"),
+    "nestled": ("The cabin was nestled in the woods.", "tucked, situated", "자리 잡다", "위치하다"),
+    "towering": ("The towering trees made the forest dark and cool.", "lofty, soaring", "우뚝 솟은", "높은"),
+    "peaks": ("The mountain peaks were covered in snow.", "summits, crests", "산봉우리", "정상"),
+    "said": ("He said he would be back soon.", "stated, remarked", "말하다", "언급하다"),
+    "protected": ("The ancient ruins were protected by law.", "guarded, sheltered", "보호된", "지켜진"),
+    "massive": ("The massive ship docked at the port.", "enormous, huge", "거대한", "엄청난"),
+    "supposedly": ("The treasure was supposedly buried under the tree.", "allegedly, reportedly", "아마", "추정상"),
+    "watched": ("They watched the movie together.", "observed, viewed", "보다", "관찰하다"),
+    "perch": ("The bird found a perch on the windowsill.", "roost, rest", "횃대", "앉다")
 }
 # Words to be excluded from both the word cloud and the word list
 exclude_words = set([
+    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
     'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
     'alex', 'mia', 'sam', 'echo', 'ridge', 'guardian', 'of', 'the', 'glen'
 ])
+def get_wordnet_pos(treebank_tag):
+    if treebank_tag.startswith('J'):
+        return wordnet.ADJ
+    elif treebank_tag.startswith('V'):
+        return wordnet.VERB
+    elif treebank_tag.startswith('N'):
+        return wordnet.NOUN
+    elif treebank_tag.startswith('R'):
+        return wordnet.ADV
+    else:
+        return None
 def process_text(text):
     words = nltk.word_tokenize(text)
     words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words and word.lower() not in exclude_words]
 def translate_and_get_pos(word_freq, pos_tags):
     """Build the vocabulary-table rows for words that have a curated entry.

     Only words present in ``word_data_examples``, absent from
     ``exclude_words`` and tagged at least once with a tag known to
     ``pos_map`` produce a row.

     Args:
         word_freq: Mapping of word -> occurrence count.
         pos_tags: Sequence of (token, Penn Treebank tag) pairs.

     Returns:
         A list of ``(word, pos_str, translation, example_sentence, synonyms)``
         tuples sorted by descending frequency. The sort is stable, so words
         with equal counts keep the iteration order of ``word_freq``.
     """
     pos_map = {
         'NN': 'n.', 'NNS': 'n.', 'NNP': 'n.', 'NNPS': 'n.', 'VB': 'v.', 'VBD': 'v. (과거형)', 'VBG': 'v. (ing형)',
         'VBN': 'v. (과거분사형/수동태)', 'VBP': 'v.', 'VBZ': 'v.', 'JJ': 'adj.', 'JJR': 'adj.', 'JJS': 'adj.',
         'RB': 'adv.', 'RBR': 'adv.', 'RBS': 'adv.', 'IN': 'prep.', 'DT': 'det.', 'CC': 'conj.',
         'UH': 'intj.'
     }
     # Index the tags per token once instead of rescanning pos_tags for every word.
     tags_by_word = {}
     for tagged in pos_tags:
         tags_by_word.setdefault(tagged[0], []).append(tagged[1])
     seen_verbs = set()  # Base forms that have already been considered for a verb-form note
     word_data = []
     for word in word_freq:
         if word not in word_data_examples or word in exclude_words:
             continue  # No curated entry, or explicitly excluded
         word_tags = tags_by_word.get(word, [])
         # dict.fromkeys de-duplicates while keeping first-seen order; joining a
         # set of strings would give an order that varies between interpreter runs.
         pos_labels = list(dict.fromkeys(pos_map[tag] for tag in word_tags if tag in pos_map))
         if not pos_labels:
             continue  # Never tagged with a category the table can display
         pos_str = ", ".join(pos_labels)
         # Only the first verb tag can trigger a note: once the base form is in
         # seen_verbs, later occurrences (and later words sharing it) are skipped.
         first_verb_tag = next(
             (tag for tag in word_tags if get_wordnet_pos(tag) == wordnet.VERB),
             None,
         )
         if first_verb_tag is not None:
             lemmatized_word = lemmatizer.lemmatize(word, wordnet.VERB)
             if word != lemmatized_word and lemmatized_word not in seen_verbs:
                 if first_verb_tag.startswith('VBD'):
                     pos_str += f" (v. {lemmatized_word}의 과거형)"
                 elif first_verb_tag.startswith('VBG'):
                     pos_str += f" (v. {lemmatized_word}의 ing형)"
                 elif first_verb_tag.startswith('VBN'):
                     pos_str += f" (v. {lemmatized_word}의 과거분사형/수동태)"
                 seen_verbs.add(lemmatized_word)
         entry = word_data_examples[word]
         translation = f"{entry[2]}, {entry[3]}"
         example_sentence, synonyms = entry[:2]
         word_data.append((word, pos_str, translation, example_sentence, synonyms))
     # Sort the word data by frequency, most frequent first
     word_data.sort(key=lambda row: word_freq[row[0]], reverse=True)
     return word_data
 def main(text):
     """Run the pipeline for one input text and return the two Gradio outputs.

     Returns a pair: the value produced by ``generate_wordcloud`` and an HTML
     table (as a string) holding the rows from ``translate_and_get_pos``.
     """
     word_freq, pos_tags = process_text(text)
     wordcloud_image = generate_wordcloud(word_freq)
     rows = translate_and_get_pos(word_freq, pos_tags)
     # Column headers of the rendered vocabulary table, in row-tuple order
     headers = [
         "어휘 (Word)",
         "범주 (Category)",
         "뜻 (Meaning)",
         "예문 (Example)",
         "동의어 (Synonyms)",
     ]
     table_frame = pd.DataFrame(rows, columns=headers)
     return wordcloud_image, table_frame.to_html(index=False, justify='center')
 # Custom CSS for the Gradio interface
 css = """
     background-color: blue !important;
     border-color: blue !important;
 }
+table {
+    width: 100%;
+    border-collapse: collapse;
+    text-align: center;
+}
+th, td {
+    padding: 8px;
+    border: 1px solid #ddd;
+}
+th {
+    background-color: #f2f2f2;
+}
 </style>
 """
 interface = gr.Interface(
     fn=main,
     inputs="text",
+    outputs=["image", "html"],
     title="Wordcloud Vocabulary Learning App",
     description="Input text to generate a word cloud and a frequency list with Korean meanings, parts of speech, and example sentences."
      "<br><br><b>The full text:</b><br>"