Add a model card showing how to use DNA_bert with the Hugging Face API

#2
by moeinh77 - opened
Files changed (1) hide show
  1. README.md +54 -0
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - dna_bert
4
+ ---
5
+ ```python
6
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

# Placeholder value: set this to the number of classes in your data.
NUM_CLASSES = 2

# The checkpoint identifier must be passed as a string; written bare it
# would be parsed as the division `zhihan1996 / DNA_bert_6`.
MODEL_NAME = "zhihan1996/DNA_bert_6"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, do_lower_case=False
)

# num_labels is taken from NUM_CLASSES above, so update that constant
# rather than editing this call.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_CLASSES
)
19
+
20
def return_kmer(seq: str, K: int = 6) -> str:
    """
    Return the overlapping K-mers of a sequence as a single string.

    Parameters
    ----------
    seq : str
        A single sequence to be split into K-mers
    K : int, optional
        The length of the K-mers, by default 6

    Returns
    -------
    kmer_seq : str
        A string of K-mers separated by spaces. The string is empty when
        ``seq`` is shorter than ``K``.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    # A non-positive K would yield empty or nonsensical slices instead of
    # K-mers, so reject it up front.
    if K < 1:
        raise ValueError(f"K must be a positive integer, got {K}")

    # One window starts at every index that still leaves room for K
    # characters; the range is empty when the sequence is shorter than K.
    return " ".join(seq[i : i + K] for i in range(len(seq) - K + 1))
41
+
42
# Placeholder data: replace with your own DNA sequences.
# This must be a list (or other iterable) of strings. The comprehension
# below iterates over it, so a single bare string would be split into
# individual characters before reaching return_kmer.
sequence = [
    "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC",
    "TTGACCGATTAGCCATGCAATCGGCTAGCTAGGCTTAACGGATCGATCCGATAGCTAGA",
]

# Turn every raw sequence into a space-separated string of K-mers.
train_kmers = [return_kmer(seq) for seq in sequence]

train_encodings = tokenizer.batch_encode_plus(
    train_kmers,
    max_length=512,  # max len of BERT
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
54
+ ```