Upload 12 files

Browse files

Model and Python training files

Files changed (12) hide show

model/config.json +62 -0
model/generation_config.json +7 -0
model/pytorch_model.bin +3 -0
model/special_tokens_map.json +107 -0
model/spiece.model +3 -0
model/tokenizer.json +0 -0
model/tokenizer_config.json +112 -0
model_training.py +76 -0
test.csv +0 -0
test_model.py +19 -0
transcript_downloader.py +101 -0
urls.csv +1 -0

model/config.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "_name_or_path": "google/flan-t5-small",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 1024,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_decoder_layers": 8,
+  "num_heads": 6,
+  "num_layers": 8,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 200,
+      "min_length": 30,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4,
+      "prefix": "summarize: "
+    },
+    "translation_en_to_de": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to German: "
+    },
+    "translation_en_to_fr": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to French: "
+    },
+    "translation_en_to_ro": {
+      "early_stopping": true,
+      "max_length": 300,
+      "num_beams": 4,
+      "prefix": "translate English to Romanian: "
+    }
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.2",
+  "use_cache": true,
+  "vocab_size": 32128
+}

model/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.33.2"
+}

model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e5838dc0a49ce85ea877140feae20f5fd1b9d1bbb460f535c3f9ffae494ae03
+size 307907461

model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "</s>",
+  "unk_token": "<unk>"
+}

model/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,112 @@

+{
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_ids": 100,
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

model_training.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
+from datasets import load_dataset, Dataset
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+def load_transcripts(directory):
+    """Read every ``.txt`` file in *directory* and return the file contents.
+
+    Args:
+        directory: Path of the folder that holds the transcript files.
+
+    Returns:
+        A list with the full text of each ``.txt`` file, ordered by filename.
+        Entries with another extension, and entries that are not regular
+        files, are ignored.
+
+    Raises:
+        FileNotFoundError: If *directory* does not exist.
+    """
+    transcript_texts = []
+    # os.listdir() yields entries in arbitrary order; sorting keeps the
+    # order of the resulting training examples reproducible between runs.
+    for filename in sorted(os.listdir(directory)):
+        if not filename.endswith('.txt'):
+            continue
+        file_path = os.path.join(directory, filename)
+        # A sub-directory whose name ends in ".txt" cannot be opened as a file
+        if not os.path.isfile(file_path):
+            continue
+        with open(file_path, 'r', encoding='utf-8') as file:
+            transcript_texts.append(file.read())
+    return transcript_texts
+def main():
+    """Fine-tune ``google/flan-t5-small`` on local transcripts and save it.
+
+    Training text is read from the ``transcripts`` directory, evaluation text
+    from the ``transcript`` column of ``test.csv``. The model is trained with
+    the Hugging Face ``Trainer``; the model and tokenizer are then written to
+    ``./model``.
+
+    Raises:
+        ValueError: If the ``transcripts`` directory holds no ``.txt`` files.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
+    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+    # The tokenizer's own padding token is kept on purpose. The collator used
+    # below masks every pad-token position out of the labels, so reusing the
+    # EOS token for padding would also hide EOS from the loss and the model
+    # would never be trained to end a sequence.
+
+    def tokenize(texts):
+        # One tokenization recipe for both splits, so that evaluation sees
+        # inputs prepared exactly like the training inputs.
+        return tokenizer(texts, truncation=True, padding='max_length',
+                         max_length=tokenizer.model_max_length)
+
+    datasets = load_dataset(
+        'csv', data_files={'test': 'test.csv'})
+    test_dataset = datasets['test'].map(
+        lambda examples: tokenize(examples['transcript']),
+        batched=True
+    )
+    transcripts = load_transcripts('transcripts')
+    if not transcripts:
+        raise ValueError(
+            "No .txt transcripts found in 'transcripts'; nothing to train on")
+    train_encodings = tokenize(transcripts)
+    train_dataset = Dataset.from_dict(
+        {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]})
+    # NOTE(review): with mlm=False this collator copies input_ids into the
+    # labels, i.e. the model learns to reproduce its input. Confirm that this
+    # is the intended objective for a sequence-to-sequence model.
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=False)
+    training_args = TrainingArguments(
+        output_dir="./model",
+        overwrite_output_dir=True,
+        num_train_epochs=3,
+        per_device_train_batch_size=4,
+        gradient_accumulation_steps=4,
+        save_steps=2_000,
+        save_total_limit=2,
+        logging_dir='./logs',
+        learning_rate=5e-5,
+        weight_decay=0.01,
+        evaluation_strategy="steps",
+        eval_steps=500,
+        warmup_steps=300
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+        eval_dataset=test_dataset,
+    )
+    # Training
+    trainer.train()
+    # Report the final evaluation metrics instead of discarding them
+    metrics = trainer.evaluate()
+    print(metrics)
+    # Saving the trained model
+    model.save_pretrained("./model")
+    tokenizer.save_pretrained("./model")
+if __name__ == "__main__":
+    main()

test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

test_model.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+# Location of the fine-tuned checkpoint written by the training script
+MODEL_DIR = "./model"
+# Restore the tokenizer and the fine-tuned model from disk
+tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, model_max_length=512)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
+# Prompt that is fed to the model
+text = "premise: woman are emotional creatures. outcome: why do woman get upset easily?"
+# Tokenize the prompt into a PyTorch tensor of token ids
+encoded = tokenizer(text, return_tensors="pt")
+input_ids = encoded.input_ids
+# Generate one output sequence, limited to a maximum length of 500
+generation_options = {"max_length": 500, "num_return_sequences": 1}
+outputs = model.generate(input_ids, **generation_options)
+# Turn the first generated sequence back into text, dropping special tokens
+decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(decoded)

transcript_downloader.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import csv
+import re
+import os
+import yt_dlp
+import whisper
+from tqdm import tqdm
+import ssl
+# NOTE(review): this swaps the ssl module's default HTTPS context factory for
+# the unverified one, which disables certificate verification for code in this
+# process that relies on that default. Presumably a workaround for a local
+# certificate problem; confirm it is still needed before keeping it.
+ssl._create_default_https_context = ssl._create_unverified_context
+# Folders for the downloaded audio files and the generated transcripts
+audio_folder = 'audio_files'
+transcripts_folder = 'transcripts'
+def sanitize_filename(filename):
+    """
+    Strips characters that are not allowed in filenames and caps the length.
+
+    Backslash, slash, asterisk, question mark, colon, double quote, angle
+    brackets and the pipe character are removed; the remaining text is cut
+    to at most 200 characters.
+    """
+    max_length = 200
+    cleaned = re.sub(r'[\\/*?:"<>|]', '', filename)
+    # Slicing leaves strings that are already short enough unchanged
+    return cleaned[:max_length]
+def download_audio_with_yt_dlp(video_id):
+    """Download the audio of one YouTube video and extract it as MP3.
+
+    The output file is placed in ``audio_folder`` and named after
+    *video_id*; the extension is filled in by yt-dlp.
+
+    Args:
+        video_id: YouTube video id used to build the watch URL.
+    """
+    # Post-processing step that converts the downloaded stream to MP3
+    mp3_extractor = {
+        'key': 'FFmpegExtractAudio',
+        'preferredcodec': 'mp3'
+    }
+    options = {
+        'format': 'bestaudio/best',
+        'postprocessors': [mp3_extractor],
+        'outtmpl': f'{audio_folder}/{video_id}.%(ext)s'
+    }
+    watch_url = f'https://www.youtube.com/watch?v={video_id}'
+    with yt_dlp.YoutubeDL(options) as downloader:
+        downloader.download([watch_url])
+# Whisper model shared by all transcriptions; loaded on first use
+_WHISPER_MODEL = None
+def transcribe_audio_with_whisper(audio_file):
+    """Transcribe an audio file with the Whisper ``base`` model.
+
+    The model is loaded once and reused, so transcribing many files does not
+    pay the model loading cost for every file.
+
+    Args:
+        audio_file: Path of the audio file to transcribe.
+
+    Returns:
+        The ``"text"`` entry of the Whisper transcription result.
+    """
+    global _WHISPER_MODEL
+    if _WHISPER_MODEL is None:
+        _WHISPER_MODEL = whisper.load_model("base")
+    result = _WHISPER_MODEL.transcribe(audio_file)
+    return result["text"]
+def process_video(video_id):
+    """Download one video's audio, transcribe it and save the transcript.
+
+    The transcript is written to ``<transcripts_folder>/<video_id>.txt``.
+    A video whose download fails, or whose audio file is missing afterwards,
+    is reported and skipped so that a single unavailable video does not
+    abort the rest of a batch.
+
+    Args:
+        video_id: YouTube video id.
+    """
+    audio_file = f"{audio_folder}/{video_id}.mp3"
+    # Download audio
+    try:
+        download_audio_with_yt_dlp(video_id)
+    except yt_dlp.utils.DownloadError as error:
+        print(f"Skipping {video_id}: download failed ({error})")
+        return
+    # Make sure the expected MP3 exists before handing it to the transcriber
+    if not os.path.exists(audio_file):
+        print(f"Skipping {video_id}: audio file {audio_file} was not created")
+        return
+    # Transcribe audio
+    transcript = transcribe_audio_with_whisper(audio_file)
+    # Save the transcript
+    with open(f"{transcripts_folder}/{video_id}.txt", 'w', encoding="utf-8") as outfile:
+        outfile.write(transcript)
+def download_audio_from_playlist(playlist_url):
+    """Look up the videos of a playlist and process each one.
+
+    Only the playlist metadata is fetched here; the download and the
+    transcription of each video are done by ``process_video``.
+
+    Args:
+        playlist_url: URL of the YouTube playlist.
+    """
+    # Nothing is downloaded by this lookup (download=False), so only the
+    # extraction-related options are needed.
+    ydl_opts = {
+        'ignoreerrors': True,  # Continue on extraction errors
+        'extract_flat': True,  # Just get video IDs from the playlist
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info_dict = ydl.extract_info(playlist_url, download=False)
+        # With 'ignoreerrors' set, a failed extraction yields None
+        if not info_dict:
+            print(f"Skipping {playlist_url}: playlist could not be read")
+            return
+        entries = info_dict.get('entries')
+        if entries is None:
+            # The URL resolved to a single video rather than a playlist
+            entries = [info_dict]
+        for video in entries:
+            if video and video.get('id'):  # Video is not None or deleted
+                process_video(video['id'])
+def _extract_video_id(url):
+    """Return the video id contained in a YouTube video URL."""
+    from urllib.parse import parse_qs, urlparse
+    parsed = urlparse(url)
+    # Short links carry the id in the path: https://youtu.be/<id>
+    if parsed.hostname in ('youtu.be', 'www.youtu.be'):
+        return parsed.path.strip('/')
+    # Watch links carry it in the "v" query parameter; parsing the query
+    # keeps trailing parameters such as "&t=10s" out of the id.
+    ids = parse_qs(parsed.query).get('v')
+    if ids:
+        return ids[0]
+    # Unrecognized URL shape: fall back to the plain text split
+    return url.split('v=')[-1]
+def download_transcripts_from_csv(file_path):
+    """Process every URL listed in the first column of a CSV file.
+
+    Rows whose URL contains ``list=`` are handled as playlists, all other
+    rows as single videos. Empty rows are skipped and surrounding whitespace
+    around a URL is ignored.
+
+    Args:
+        file_path: Path of the CSV file with one URL per row.
+    """
+    # newline='' is what the csv module expects; utf-8-sig also accepts a
+    # file that starts with a byte order mark.
+    with open(file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
+        reader = csv.reader(csvfile)
+        for row in tqdm(reader):
+            if not row or not row[0].strip():
+                continue
+            url = row[0].strip()
+            if "list=" in url:
+                # It's a playlist, process each video in the playlist
+                download_audio_from_playlist(url)
+            else:
+                # It's a single video
+                process_video(_extract_video_id(url))
+if __name__ == "__main__":
+    # Create the output folders up front; existing folders are left as is
+    for folder in (audio_folder, transcripts_folder):
+        os.makedirs(folder, exist_ok=True)
+    # Work through every URL listed in the CSV file
+    download_transcripts_from_csv('urls.csv')

urls.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ https://www.youtube.com/playlist?list=PLB13i9vubrBfR0IfNO6ueZBPH_S7Zo0Po