# Connect to Google Drive

In [1]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Libraries

In [15]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from tqdm import tqdm

In [5]:
# Read csv
elon_tweets = pd.read_csv('/content/drive/MyDrive/elon_musk_tweets.csv')
non_elon_tweets = pd.read_csv('/content/drive/MyDrive/Tweets.csv')

elon_tweets

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1544379368478212100,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240855,115,13503,True,2022-07-05 17:55:09+00:00,@BillyM2k I find the gold toe sock – inevitabl...,,Twitter for iPhone,335,6542,False
1,1544377493263720450,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:47:42+00:00,"Sock Con, the conference for socks",,Twitter for iPhone,1451,30753,False
2,1544377130590552064,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:46:15+00:00,Always something new for the magazine cover an...,,Twitter for iPhone,1284,28610,False
3,1544375575724400645,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:40:05+00:00,@ExplainThisBob This guy gets it,,Twitter for iPhone,131,3640,False
4,1544375148605853699,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:38:23+00:00,Sock tech is so advanced that you can get pret...,,Twitter for iPhone,1191,23790,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5899,1665143503108677634,Elon Musk,,,2009-06-02 20:12:29+00:00,143325985,330,25655,False,2023-06-03 23:48:42+00:00,@JonErlichman He’s not wrong …,,Twitter for iPhone,361,4791,False
5900,1665139144425631747,Elon Musk,,,2009-06-02 20:12:29+00:00,143325985,330,25655,False,2023-06-03 23:31:23+00:00,"@alifarhat79 Guys, I think I maybe took too mu...",,Twitter for iPhone,1609,61964,False
5901,1665137204782419968,Elon Musk,,,2009-06-02 20:12:29+00:00,143325985,330,25655,False,2023-06-03 23:23:41+00:00,@sriramk Cool,,Twitter for iPhone,46,879,False
5902,1665131126900285445,Elon Musk,,,2009-06-02 20:12:29+00:00,143325985,330,25655,False,2023-06-03 22:59:31+00:00,@cb_doge Time to complete the circle,,Twitter for iPhone,898,12467,False


In [6]:
# Drop off all columns except text from elon musk tweets. Delete row if retweet

elon_tweets = elon_tweets[elon_tweets['is_retweet'] == False]
elon_tweets = elon_tweets[['text']]

elon_tweets

Unnamed: 0,text
0,@BillyM2k I find the gold toe sock – inevitabl...
1,"Sock Con, the conference for socks"
2,Always something new for the magazine cover an...
3,@ExplainThisBob This guy gets it
4,Sock tech is so advanced that you can get pret...
...,...
5899,@JonErlichman He’s not wrong …
5900,"@alifarhat79 Guys, I think I maybe took too mu..."
5901,@sriramk Cool
5902,@cb_doge Time to complete the circle


In [8]:
non_elon_tweets = non_elon_tweets[['text']]
non_elon_tweets

Unnamed: 0,text
0,"I`d have responded, if I were going"
1,Sooo SAD I will miss you here in San Diego!!!
2,my boss is bullying me...
3,what interview! leave me alone
4,"Sons of ****, why couldn`t they put them on t..."
...,...
27476,wish we could come see u on Denver husband l...
27477,I`ve wondered about rake to. The client has ...
27478,Yay good for both of you. Enjoy the break - y...
27479,But it was worth it ****.


In [19]:
def load_and_preprocess_data(elon_file, non_elon_file):
    elon_tweets = pd.read_csv(elon_file)
    non_elon_tweets = pd.read_csv(non_elon_file)
    non_elon_tweets = non_elon_tweets[['text']]
    elon_tweets = elon_tweets[elon_tweets['is_retweet'] == False]
    elon_tweets = elon_tweets[['text']]
    # 'text' 列が存在することを確認し、存在しない場合は適切な列名に変更
    text_column = 'text' if 'text' in elon_tweets.columns else elon_tweets.columns[0]

    elon_tweets['label'] = 1
    non_elon_tweets['label'] = 0

    all_tweets = pd.concat([elon_tweets, non_elon_tweets], ignore_index=True)

    # None値や空の文字列を除去
    all_tweets = all_tweets.dropna(subset=[text_column])
    all_tweets = all_tweets[all_tweets[text_column].astype(bool)]

    # テキストを文字列に変換
    texts = all_tweets[text_column].astype(str).tolist()
    labels = all_tweets['label'].tolist()

    return train_test_split(texts, labels, test_size=0.2, random_state=42)

# データの読み込みと分割
train_texts, test_texts, train_labels, test_labels = load_and_preprocess_data('/content/drive/MyDrive/elon_musk_tweets.csv', '/content/drive/MyDrive/Tweets.csv')

In [20]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [21]:
def preprocess_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
    return dataset

train_dataset = preprocess_data(train_texts, train_labels)
test_dataset = preprocess_data(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [22]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}')

Epoch 1/3: 100%|██████████| 1670/1670 [04:16<00:00,  6.50it/s]


Epoch 1/3 completed. Average loss: 0.0444


Epoch 2/3: 100%|██████████| 1670/1670 [04:15<00:00,  6.55it/s]


Epoch 2/3 completed. Average loss: 0.0157


Epoch 3/3: 100%|██████████| 1670/1670 [04:15<00:00,  6.54it/s]

Epoch 3/3 completed. Average loss: 0.0087





In [23]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')

Evaluating: 100%|██████████| 418/418 [00:17<00:00, 23.91it/s]

Test Accuracy: 0.99





In [29]:
def classify_tweet(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)
    prediction = torch.argmax(probabilities, dim=1).item()
    return "Elon Musk" if prediction == 1 else "Not Elon Musk"

# 使用例
new_tweet = "I'm Elon"
result = classify_tweet(new_tweet)
print(f"The tweet '{new_tweet}' is classified as: {result}")

The tweet 'I'm Elon' is classified as: Not Elon Musk


In [35]:
model.save_pretrained('/content/drive/MyDrive/EMD')

In [36]:
tokenizer.save_pretrained('/content/drive/MyDrive/EMD')

('/content/drive/MyDrive/EMD/tokenizer_config.json',
 '/content/drive/MyDrive/EMD/special_tokens_map.json',
 '/content/drive/MyDrive/EMD/vocab.txt',
 '/content/drive/MyDrive/EMD/added_tokens.json')

In [37]:
!git clone https://huggingface.co/kix-intl/elon-musk-detector.git

Cloning into 'elon-musk-detector'...
fatal: could not read Username for 'https://huggingface.co': No such device or address


In [41]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [42]:
!git clone https://huggingface.co/kix-intl/elon-musk-detector.git

Cloning into 'elon-musk-detector'...
remote: Enumerating objects: 3, done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (3/3), 1.05 KiB | 1.05 MiB/s, done.


In [43]:
%cd /content/elon-musk-detector

/content/elon-musk-detector


In [44]:
!git lfs install

Updated git hooks.
Git LFS initialized.


In [46]:
!huggingface-cli lfs-enable-largefiles .

Local repo set up for largefiles


In [47]:
!mv /content/drive/MyDrive/EMD /content/elon-musk-detector/

In [49]:
!git config --global user.email "koko8.dev@gmail.com"
!git config --global user.name "kix-intl"

In [50]:
!git add .
!git commit -m "add model"
!git push

[main 3570286] add model
 5 files changed, 30614 insertions(+)
 create mode 100644 config.json
 create mode 100644 model.safetensors
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer_config.json
 create mode 100644 vocab.txt
Uploading LFS objects: 100% (1/1), 272 MB | 11 MB/s, done.
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 109.34 KiB | 5.47 MiB/s, done.
Total 7 (delta 0), reused 0 (delta 0), pack-reused 0
To https://huggingface.co/kix-intl/elon-musk-detector.git
   c285fdd..3570286  main -> main
