cryptocalypse committed
Commit 40f0e47
1 Parent(s): cf93e36

Create nos.py

Files changed (1)
  1. nos.py +326 -0
nos.py ADDED
@@ -0,0 +1,326 @@
import sys
import math
import re

import heapq
from collections import defaultdict, Counter
from typing import List, Tuple, Dict


class TextProcessor:
    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        # Character histogram and Shannon entropy of the text.
        simbolos = {}
        total_caracteres = len(self.texto)

        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        # Brute-force search for the longest common substring of cadena1 and cadena2.
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        subcadenas_comunes = []

        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while i + k < longitud1 and j + k < longitud2 and cadena1[i + k] == cadena2[j + k]:
                    k += 1
                if k > 0:
                    subcadenas_comunes.append(cadena1[i:i + k])

        if subcadenas_comunes:
            comun = max(subcadenas_comunes, key=len)

        return comun

    def magic_split(self):
        # Choose the symbol whose gaps between occurrences vary the least
        # (excluding variations of 0 and 1) and use it as the splitter.
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances

        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items() if distances}

        mins = {}
        for v in variation:
            if variation[v] != 0 and variation[v] != 1:
                mins[v] = variation[v]

        best_symbol = min(mins, key=mins.get)

        return best_symbol

    def rotate_string(self, string, n):
        indice = n % len(string)
        string_rotado = string[indice:] + string[:indice]
        return string_rotado

    def rotate_compare(self, tokiA, tokiB):
        # Try every rotation of the lexicographically greater token and keep
        # the best common substring with the other token.
        if tokiA >= tokiB:
            tokA = tokiA
            tokB = tokiB
        else:
            tokA = tokiB
            tokB = tokiA
        ltokA = len(tokA)

        i = 0
        rotations = {}
        while i < ltokA:
            tokrotated = self.rotate_string(tokA, i)
            rotations[str(i)] = self.common_string(tokrotated, tokB)
            i += 1

        best_r = ""
        for x in rotations:
            lb = len(best_r)
            rot = rotations[x]
            lrot = len(rot)
            if lrot > 1 and lrot < ltokA and lrot > lb:
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        # Compare every pair of distinct chunks and collect their rotated common substrings.
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))

        return list(set(toks))

    def tokenize(self, spliter_optimo):
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
            else:
                for tok in tokens:
                    if tok != "":
                        lt = len(tok)
                        lb = len(best_split)
                        spltxt = txt.split(tok)
                        if len(spltxt) > 1 and lt < len(txt) and lt > lb:
                            best_split = tok
                            tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]

        return tokenized_sentence

    def symbol_distances(self, texto, tokens):
        # Mark every token occurrence in the text with '-' delimiters,
        # then split on '-' and drop the empty pieces.
        txt = texto
        for tok in tokens:
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")

        arr = txt.split("-")
        return [elem for elem in arr if elem != '']

    def distances(self, tokens):
        # Map each token to the list of positions where it appears.
        tokens_unicos = {}
        for i, token in enumerate(tokens):
            if token not in tokens_unicos:
                tokens_unicos[token] = [i]
            else:
                tokens_unicos[token].append(i)

        return tokens_unicos

    def from_distances(self, tokens_distancias):
        # Rebuild position -> token and position -> token-index maps, then
        # serialise the token dictionary and the position-ordered token indices
        # as comma-separated strings.
        rebuild = {}
        recoded_dic = {}
        for tok in tokens_distancias:
            for dis in tokens_distancias[tok]:
                rebuild[dis] = tok
                recoded_dic[dis] = gindex(tokens_distancias, tok)

        enc = {k: recoded_dic[k] for k in sorted(recoded_dic)}
        rebu = {k: rebuild[k] for k in sorted(rebuild)}

        dic_str = ""
        for d in tokens_distancias:
            dic_str += "," + d

        enc_str = ""
        for e in enc:
            enc_str += "," + str(enc[e])

        return dic_str, enc_str


def gindex(obj, key):
    # Position of `key` among the dictionary's keys, or None if absent.
    keys = list(obj.keys())
    try:
        index = keys.index(key)
        return index
    except ValueError:
        return None  # Key not found in the dictionary


# Example usage:
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)

token_txt = ""
for token in tokenized_sentence:
    token_txt += "-" + tokenized_sentence[token]

tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo, tokens)

print("Tokens")
print(tokens)

print("Number of symbols in tokens:")
print(len(tokens))

print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text", len(texto_ejemplo))

print("Original text:", texto_ejemplo)
print("Optimal splitter:", spliter_optimo)
print("Tokenized sentence:", tokenized_sentence)
print("Length tokenized", len(tokenized_sentence))
print("Token Sentences", symb)
print("Length Token Sentence", len(symb))
print("Length Symbols Token Dictionary", len(set(symb)))
distances = processor.distances(symb)

print("Token Distances", distances)
print("Token Distance Length", len(distances))

print(gindex(distances, "cu"))
dic_str, enc_str = processor.from_distances(distances)
print(dic_str, enc_str)


class HuffmanNode:
    def __init__(self, char: str, freq: int):
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        return self.freq < other.freq


def build_huffman_tree(text: str) -> HuffmanNode:
    # Build the Huffman tree bottom-up by repeatedly merging the two
    # lowest-frequency nodes.
    frequency = Counter(text)
    priority_queue = [HuffmanNode(char, freq) for char, freq in frequency.items()]
    heapq.heapify(priority_queue)

    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)

        merged_node = HuffmanNode(None, left.freq + right.freq)
        merged_node.left = left
        merged_node.right = right

        heapq.heappush(priority_queue, merged_node)

    return priority_queue[0]


def encode_huffman_tree(node: HuffmanNode, prefix: str = "") -> Dict[str, str]:
    # Walk the tree and assign a '0'/'1' prefix code to every leaf character.
    if node is None:
        return {}

    if node.char is not None:
        return {node.char: prefix}

    encoding = {}
    encoding.update(encode_huffman_tree(node.left, prefix + "0"))
    encoding.update(encode_huffman_tree(node.right, prefix + "1"))

    return encoding


def huffman_encode(text: str) -> Tuple[Dict[str, str], bytes]:
    root = build_huffman_tree(text)
    encoding_map = encode_huffman_tree(root)
    encoded_text = ''.join(encoding_map[char] for char in text)

    # Pad the encoded string to a multiple of 8 bits so it can be converted to bytes.
    remainder = len(encoded_text) % 8
    if remainder != 0:
        encoded_text += '0' * (8 - remainder)

    # Convert the binary string to bytes.
    encoded_bytes = bytes(int(encoded_text[i:i + 8], 2) for i in range(0, len(encoded_text), 8))

    return encoding_map, encoded_bytes


def huffman_decode(encoding_map: Dict[str, str], encoded_bytes: bytes) -> str:
    # Convert the bytes back into a binary string.
    encoded_text = ''.join(format(byte, '08b') for byte in encoded_bytes)

    decoding_map = {code: char for char, code in encoding_map.items()}
    decoded_text = ""
    current_code = ""
    for bit in encoded_text:
        current_code += bit
        if current_code in decoding_map:
            decoded_text += decoding_map[current_code]
            current_code = ""
    return decoded_text


def guardar_binarios_en_archivo(binarios: List[bytes], nombre_archivo: str):
    with open(nombre_archivo, 'wb') as archivo:
        for binario in binarios:
            archivo.write(binario)
            archivo.write(b'\n')  # Separator between the binary blobs
    print(f"Binary data saved to file '{nombre_archivo}'")


# Example usage
cadena1 = dic_str
cadena2 = enc_str

# Encode cadena1 and cadena2
encoding_map1, encoded_bytes1 = huffman_encode(cadena1)
encoding_map2, encoded_bytes2 = huffman_encode(cadena2)

# Save both encoded binaries in a single file
guardar_binarios_en_archivo([encoded_bytes1, encoded_bytes2], "text.txt.nos")