# nos.py
import math
import heapq
from collections import Counter
from typing import List, Tuple, Dict
class TextProcessor:
def __init__(self, texto):
self.texto = texto
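    # Shannon entropy of the text: returns the per-symbol counts and the
    # entropy in bits per symbol.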
def entropy(self):
simbolos = {}
total_caracteres = len(self.texto)
for caracter in self.texto:
simbolos[caracter] = simbolos.get(caracter, 0) + 1
entropia = 0
for count in simbolos.values():
probabilidad = count / total_caracteres
entropia -= probabilidad * math.log2(probabilidad)
return simbolos, entropia
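    # Brute-force longest common substring of cadena1 and cadena2.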
def common_string(self, cadena1, cadena2):
longitud1 = len(cadena1)
longitud2 = len(cadena2)
comun = ''
subcadenas_comunes = []
for i in range(longitud1):
for j in range(longitud2):
k = 0
while (i+k < longitud1 and j+k < longitud2 and cadena1[i+k] == cadena2[j+k]):
k += 1
if k > 0:
subcadenas_comunes.append(cadena1[i:i+k])
if subcadenas_comunes:
comun = max(subcadenas_comunes, key=len)
return comun
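    # Choose the splitter symbol: among repeated symbols, pick the one whose
    # gaps between consecutive occurrences vary the least, ignoring gap
    # variations of 0 and 1.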
def magic_split(self):
unique_symbols = set(self.texto)
symbol_distances = {}
for symbol in unique_symbols:
indices = [i for i, char in enumerate(self.texto) if char == symbol]
if len(indices) > 1:
distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
symbol_distances[symbol] = distances
        variation = {symbol: max(d) - min(d) for symbol, d in symbol_distances.items()}
        # Keep only symbols whose gap variation is neither 0 nor 1.
        mins = {s: v for s, v in variation.items() if v not in (0, 1)}
        best_symbol = min(mins, key=mins.get)
        return best_symbol
def rotate_string(self, string, n):
indice = n % len(string)
string_rotado = string[indice:] + string[:indice]
return string_rotado
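    # Rotate the longer token against the shorter one and keep the best
    # (longest, but not full-length) common substring over all rotations.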
    def rotate_compare(self, tokiA, tokiB):
        # Always make tokA the longer of the two tokens.
        if len(tokiA) >= len(tokiB):
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        ltokA = len(tokA)
i = 0
rotations = {}
while i < ltokA:
tokrotated = self.rotate_string(tokA, i)
rotations[str(i)] = self.common_string(tokrotated, tokB)
i += 1
best_r = ""
for x in rotations:
lb = len(best_r)
rot = rotations[x]
lrot = len(rot)
if lrot > 1 and lrot < ltokA and lrot > lb:
best_r = rot
return best_r
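    # Split the text on spl and collect the rotated common substrings of
    # every ordered pair of distinct chunks.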
def get_subTokens(self, spl):
sub_tokens = self.texto.split(spl)
toks = []
for tok in sub_tokens:
for tok2 in sub_tokens:
if tok != tok2:
toks.append(self.rotate_compare(tok, tok2))
return list(set(toks))
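    # Split the text on the optimal splitter and, inside each chunk of 3+
    # characters, mark the longest sub-token found by get_subTokens.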
def tokenize(self, spliter_optimo):
tokens = self.get_subTokens(spliter_optimo)
tokenized_sentence = {}
chunk = self.texto.split(spliter_optimo)
for txt in chunk:
best_split = ""
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
            else:
                for tok in tokens:
                    if tok != "":
                        lt = len(tok)
                        lb = len(best_split)
                        spltxt = txt.split(tok)
                        if len(spltxt) > 1 and lt < len(txt) and lt > lb:
                            best_split = tok
                            tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
return tokenized_sentence
    def symbol_distances(self, texto, tokens):
        # Sort the tokens by descending length so the longest possible token
        # is split out first.
        txt = texto
        for tok in sorted(tokens, key=len, reverse=True):
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")
        arr = txt.split("-")
        return [elem for elem in arr if elem != '']
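    # Map each token to the list of positions at which it occurs.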
def distances(self,tokens):
tokens_unicos = {}
for i, token in enumerate(tokens):
if token not in tokens_unicos:
tokens_unicos[token] = [i]
else:
tokens_unicos[token].append(i)
return tokens_unicos
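    # Serialize the token table: dic_str lists the unique tokens, and enc_str
    # lists, in positional order, each token's index into that table.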
    def from_distances(self, tokens_distancias):
        rebuild = {}
        recoded_dic = {}
        for tok in tokens_distancias:
            for dis in tokens_distancias[tok]:
                rebuild[dis] = tok
                recoded_dic[dis] = gindex(tokens_distancias, tok)
        enc = {k: recoded_dic[k] for k in sorted(recoded_dic)}
        rebu = {k: rebuild[k] for k in sorted(rebuild)}  # token stream in positional order (unused here)
        dic_str = ""
        for d in tokens_distancias:
            dic_str += "," + d
        enc_str = ""
        for e in enc:
            enc_str += "," + str(enc[e])
        return dic_str, enc_str
def gindex(obj, key):
keys = list(obj.keys())
try:
index = keys.index(key)
return index
except ValueError:
return None # Key not found in the dictionary
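# Quick demonstration of entropy(): for "aabbbc" the counts are
# {'a': 2, 'b': 3, 'c': 1} and the Shannon entropy is about 1.46 bits/symbol.
demo_counts, demo_entropy = TextProcessor("aabbbc").entropy()
print("Entropy demo:", demo_counts, round(demo_entropy, 2))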
# Example usage:
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)
token_txt = ""
for token in tokenized_sentence:
token_txt += "-"+tokenized_sentence[token]
tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo,tokens)
print("Tokens")
print(tokens)
print("Number of symbols in tokens:")
print(len(tokens))
print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text",len(texto_ejemplo))
print("Texto original:", texto_ejemplo)
print("Spliter 贸ptimo:", spliter_optimo)
print("Frase tokenizada:", tokenized_sentence)
print("Length tokenized",len(tokenized_sentence))
print("Token Sentences", symb)
print("Lenght Token Sentence", len(symb))
print("Length Symbols Token Dictionary",len(set(symb)))
distances = processor.distances(symb)
print("Token Distances", distances)
print("Token Distance Length", len(distances))
print(gindex(distances,"cu"))
dic_str,enc_str = processor.from_distances(distances)
print(dic_str,enc_str)
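# Quick check of the longest-common-substring helper: for the pair below it
# should return "aburr".
print("Common substring demo:", processor.common_string("aburres", "aburro"))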
class HuffmanNode:
def __init__(self, char: str, freq: int):
self.char = char
self.freq = freq
self.left = None
self.right = None
def __lt__(self, other):
return self.freq < other.freq
def build_huffman_tree(text: str) -> HuffmanNode:
frequency = Counter(text)
priority_queue = [HuffmanNode(char, freq) for char, freq in frequency.items()]
heapq.heapify(priority_queue)
while len(priority_queue) > 1:
left = heapq.heappop(priority_queue)
right = heapq.heappop(priority_queue)
merged_node = HuffmanNode(None, left.freq + right.freq)
merged_node.left = left
merged_node.right = right
heapq.heappush(priority_queue, merged_node)
return priority_queue[0]
def encode_huffman_tree(node: HuffmanNode, prefix: str = "") -> Dict[str, str]:
if node is None:
return {}
if node.char is not None:
return {node.char: prefix}
encoding = {}
encoding.update(encode_huffman_tree(node.left, prefix + "0"))
encoding.update(encode_huffman_tree(node.right, prefix + "1"))
return encoding
def huffman_encode(text: str) -> Tuple[Dict[str, str], bytes, int]:
    root = build_huffman_tree(text)
    encoding_map = encode_huffman_tree(root)
    encoded_text = ''.join(encoding_map[char] for char in text)
    # Record the real bit length, then pad the encoded string to a multiple
    # of 8 so it can be converted to bytes.
    bit_length = len(encoded_text)
    remainder = bit_length % 8
    if remainder != 0:
        encoded_text += '0' * (8 - remainder)
    # Convert the binary string to bytes
    encoded_bytes = bytes(int(encoded_text[i:i+8], 2) for i in range(0, len(encoded_text), 8))
    return encoding_map, encoded_bytes, bit_length
def huffman_decode(encoding_map: Dict[str, str], encoded_bytes: bytes, bit_length: int) -> str:
    # Convert the bytes back to a binary string and drop the padding bits.
    encoded_text = ''.join(format(byte, '08b') for byte in encoded_bytes)[:bit_length]
decoding_map = {code: char for char, code in encoding_map.items()}
decoded_text = ""
current_code = ""
for bit in encoded_text:
current_code += bit
if current_code in decoding_map:
decoded_text += decoding_map[current_code]
current_code = ""
return decoded_text
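# Round-trip self-check: Huffman coding is lossless, so decoding with the
# recorded bit length should reproduce the input exactly.
_demo_map, _demo_bytes, _demo_bits = huffman_encode("abracadabra")
assert huffman_decode(_demo_map, _demo_bytes, _demo_bits) == "abracadabra"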
def guardar_binarios_en_archivo(binarios: List[bytes], nombre_archivo: str):
with open(nombre_archivo, 'wb') as archivo:
for binario in binarios:
archivo.write(binario)
            archivo.write(b'\n')  # Separator between binaries (note: the encoded bytes may themselves contain 0x0A, so this separator is only safe for illustration)
    print(f"Binary data saved to file '{nombre_archivo}'")
# Example usage
cadena1 = dic_str
cadena2 = enc_str
# Encode cadena1 and cadena2
encoding_map1, encoded_bytes1, nbits1 = huffman_encode(cadena1)
encoding_map2, encoded_bytes2, nbits2 = huffman_encode(cadena2)
# Save both binaries in a single file
guardar_binarios_en_archivo([encoded_bytes1, encoded_bytes2], "text.txt.nos")