Commit 0647bb4 by hobs (parent: 5c11d69)

load categories from json

Files changed:
- app.py: +108 -74
- categories.json: +1 -0
app.py CHANGED

@@ -2,17 +2,117 @@
 
 import gradio as gr
 
-import os
+import json
 from pathlib import Path
 # import random
 # import time
 import torch
 import torch.nn as nn
 
-import pandas as pd
-from nlpia2.init import SRC_DATA_DIR, maybe_download
 
-
+import string
+import unicodedata
+from unidecode import unidecode
+
+
+ASCII_LETTERS = string.ascii_letters
+ASCII_PRINTABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
+ASCII_PRINTABLE_COMMON = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r'
+
+ASCII_VERTICAL_TAB = '\x0b'
+ASCII_PAGE_BREAK = '\x0c'
+ASCII_ALL = ''.join(chr(i) for i in range(0, 128))  # ASCII_PRINTABLE
+ASCII_DIGITS = string.digits
+ASCII_IMPORTANT_PUNCTUATION = " .?!,;'-=+)(:"
+ASCII_NAME_PUNCTUATION = " .,;'-"
+ASCII_NAME_CHARS = set(ASCII_LETTERS + ASCII_NAME_PUNCTUATION)
+ASCII_IMPORTANT_CHARS = set(ASCII_LETTERS + ASCII_IMPORTANT_PUNCTUATION)
+
+CURLY_SINGLE_QUOTES = '‘’`´'
+STRAIGHT_SINGLE_QUOTES = "'" * len(CURLY_SINGLE_QUOTES)
+CURLY_DOUBLE_QUOTES = '“”'
+STRAIGHT_DOUBLE_QUOTES = '"' * len(CURLY_DOUBLE_QUOTES)
+
+
+def normalize_newlines(s):
+    s = s.replace(ASCII_VERTICAL_TAB, '\n')
+    s = s.replace(ASCII_PAGE_BREAK, '\n\n')
+    return s
+
+
+class Asciifier:
+    """ Construct a function that filters out all non-ascii unicode characters
+
+    >>> test_str = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
+    >>> Asciifier(include='a b c 123XYZ')(test_str)
+    '123abcXYZ '
+    """
+
+    def __init__(
+        self,
+        min_ord=1, max_ord=128,
+        exclude=None,
+        include=ASCII_PRINTABLE,
+        exclude_category='Mn',
+        normalize_quotes=True,
+    ):
+        self.include = set(sorted(include or ASCII_PRINTABLE))
+        self._include = ''.join(sorted(self.include))
+        self.exclude = set(sorted(exclude or []))
+        self._exclude = ''.join(self.exclude)
+        self.min_ord, self.max_ord = int(min_ord), int(max_ord or 128)
+        self.normalize_quotes = normalize_quotes
+
+        if self.min_ord:
+            self.include = set(c for c in self.include if ord(c) >= self.min_ord)
+        if self.max_ord:
+            self.include = set(c for c in self.include if ord(c) <= self.max_ord)
+        if exclude_category:
+            self.include = set(
+                c for c in self.include if unicodedata.category(c) != exclude_category)
+
+        self.vocab = sorted(self.include - self.exclude)
+        self._vocab = ''.join(self.vocab)
+        self.char2i = {c: i for (i, c) in enumerate(self._vocab)}
+
+        self._translate_from = self._vocab
+        self._translate_to = self._translate_from
+
+        # FIXME: self.normalize_quotes is accomplished by unidecode.unidecode!!
+        #        ’->'  ‘->'  “->"  ”->"
+        if self.normalize_quotes:
+            trans_table = str.maketrans(
+                CURLY_SINGLE_QUOTES + CURLY_DOUBLE_QUOTES,
+                STRAIGHT_SINGLE_QUOTES + STRAIGHT_DOUBLE_QUOTES)
+            self._translate_to = self._translate_to.translate(trans_table)
+        # print(self._translate_to)
+
+        # eliminate any non-translations (if from == to)
+        self._translate_from_filtered = ''
+        self._translate_to_filtered = ''
+
+        for c1, c2 in zip(self._translate_from, self._translate_to):
+            if c1 == c2:
+                continue
+            else:
+                self._translate_from_filtered += c1
+                self._translate_to_filtered += c2
+
+        self._translate_del = ''
+        for c in ASCII_ALL:
+            if c not in self.vocab:
+                self._translate_del += c
+
+        self._translate_from = self._translate_from_filtered
+        self._translate_to = self._translate_to_filtered
+        self.translation_table = str.maketrans(
+            self._translate_from,
+            self._translate_to,
+            self._translate_del)
+
+    def __call__(self, text):
+        return unidecode(unicodedata.normalize('NFD', text)).translate(self.translation_table)
+
 
 name_char_vocab_size = len(ASCII_NAME_CHARS) + 1  # Plus EOS marker
 
@@ -31,49 +131,10 @@ char2i = {c: i for i, c in enumerate(ASCII_NAME_CHARS)}
 
 print(f'asciify("O’Néàl") => {asciify("O’Néàl")}')
 
-# Build the category_lines dictionary, a list of names per language
-category_lines = {}
-all_categories = []
-labeled_lines = []
-categories = []
-for filepath in find_files(SRC_DATA_DIR / 'names', '*.txt'):
-    filename = Path(filepath).name
-    filepath = maybe_download(filename=Path('names') / filename)
-    with filepath.open() as fin:
-        lines = [asciify(line.rstrip()) for line in fin]
-    category = Path(filename).with_suffix('')
-    categories.append(category)
-    labeled_lines += list(zip(lines, [category] * len(lines)))
 
+categories = json.load(open('categories.json'))
 n_categories = len(categories)
 
-df = pd.DataFrame(labeled_lines, columns=('name', 'category'))
-
-
-def readLines(filename):
-    lines = open(filename, encoding='utf-8').read().strip().split('\n')
-    return [asciify(line) for line in lines]
-
-
-for filename in find_files(path='data/names', pattern='*.txt'):
-    category = os.path.splitext(os.path.basename(filename))[0]
-    all_categories.append(category)
-    lines = readLines(filename)
-    category_lines[category] = lines
-
-n_categories = len(all_categories)
-
-
-######################################################################
-# Now we have ``category_lines``, a dictionary mapping each category
-# (language) to a list of lines (names). We also kept track of
-# ``all_categories`` (just a list of languages) and ``n_categories`` for
-# later reference.
-#
-
-print(category_lines['Italian'][:5])
-
-
 ######################################################################
 # Turning Names into Tensors
 # --------------------------
@@ -117,33 +178,6 @@ def encode_one_hot_seq(line):
     return tensor
 
 
-print(encode_one_hot_vec('A'))
-
-print(encode_one_hot_seq('Abe').size())
-
-
-######################################################################
-# Creating the Network
-# ====================
-#
-# Before autograd, creating a recurrent neural network in Torch involved
-# cloning the parameters of a layer over several timesteps. The layers
-# held hidden state and gradients which are now entirely handled by the
-# graph itself. This means you can implement a RNN in a very "pure" way,
-# as regular feed-forward layers.
-#
-# This RNN module (mostly copied from `the PyTorch for Torch users
-# tutorial <https://pytorch.org/tutorials/beginner/former_torchies/
-# nn_tutorial.html#example-2-recurrent-net>`__)
-# is just 2 linear layers which operate on an input and hidden state, with
-# a LogSoftmax layer after the output.
-#
-# .. figure:: https://i.imgur.com/Z2xbySO.png
-#    :alt:
-#
-#
-
-
 class RNN(nn.Module):
     def __init__(self, input_size, hidden_size, output_size):
         super(RNN, self).__init__()
@@ -178,7 +212,7 @@ output, next_hidden = rnn(input, hidden)
 def categoryFromOutput(output):
     top_n, top_i = output.topk(1)
     category_i = top_i[0].item()
-    return all_categories[category_i], category_i
+    return categories[category_i], category_i
 
 
 def output_from_str(s):
@@ -222,8 +256,8 @@ def predict(input_line, n_predictions=3):
         for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
-            print('(%.2f) %s' % (value, all_categories[category_index]))
-            predictions.append([value, all_categories[category_index]])
+            print('(%.2f) %s' % (value, categories[category_index]))
+            predictions.append([value, categories[category_index]])
 
 
 predict('Dovesky')
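The default Asciifier reproduces the asciify() behavior exercised by the print statement in the diff above. A minimal usage sketch (assuming the class and ASCII_* constants added in this commit, with the unidecode package installed; the example strings are hypothetical):

# Usage sketch only, not part of the commit. Assumes the Asciifier class and
# ASCII_NAME_CHARS constant from app.py above, plus `pip install unidecode`.
asciify = Asciifier()       # default vocabulary: all printable ASCII
print(asciify("O’Néàl"))    # unidecode transliterates accents and curly quotes -> O'Neal

name_filter = Asciifier(include=ASCII_NAME_CHARS)  # letters plus " .,;'-"
print(name_filter('Zoë (b. 1992)'))                # digits and parens deleted -> 'Zoe b. '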
categories.json ADDED

@@ -0,0 +1 @@
+["Arabic", "Irish", "Spanish", "French", "German", "English", "Korean", "Vietnamese", "Scottish", "Japanese", "Polish", "Greek", "Czech", "Italian", "Portuguese", "Russian", "Dutch", "Chinese"]
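app.py consumes this file with categories = json.load(open('categories.json')) and indexes into the list by output-unit position, so the label order here must match the order the model was trained with. A slightly more defensive load, sketched here for illustration (not part of the commit):

# Sketch only: equivalent to `categories = json.load(open('categories.json'))`
# in app.py, but closes the file handle and sanity-checks the payload.
import json
from pathlib import Path

with Path('categories.json').open(encoding='utf-8') as fin:
    categories = json.load(fin)

assert all(isinstance(c, str) for c in categories), 'expected a JSON array of label strings'
n_categories = len(categories)  # 18 language labels in this commit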