| Input: texts, user_dictionary_list, stopwords_list |
| Output: word_sequences |
(1) | //Step 1: content cleaning |
(2) | m = size(texts) //get the number of documents in texts |
(3) | For i = 1 to m do |
(4) | content = texts[i] //get the content of the ith document |
(5) | content = content.strip() //remove the leading and trailing blanks |
(6) | n = length(content) //get the number of characters in content |
(7) | For j = 1 to n do |
(8) | character = content[j] |
(9) | If character < u'\u4e00' or character > u'\u9fa5' then //character falls outside the Chinese character range |
(10) | Remove character from content //drop non-Chinese characters |
(11) | End if |
(12) | End for |
(13) | End for |
(14) | //Step 2: word splitting |
(15) | Import jieba //jieba is a Python Chinese word segmentation module |
(16) | jieba.load_userdict(user_dictionary_list) //load the user-defined dictionary |
(17) | Load stopwords_list //load the stop-word list |
(18) | corpus = [] //build the internal corpus |
(19) | For i = 1 to m do |
(20) | word_list = jieba.lcut(texts[i]) //segment the cleaned texts[i] into words |
(21) | s = length(word_list) //get the number of words in word_list |
(22) | For j = 1 to s do |
(23) | If word_list[j] in stopwords_list then |
(24) | Remove word_list[j] from word_list //filter out stop words |
(25) | End if |
(26) | End for |
(27) | corpus.append(word_list) //add the filtered word list to the corpus |
(28) | End for |
(29) | //Step 3: tokenization |
(30) | Import tensorflow //tensorflow is a machine learning framework |
(31) | tokenizer = tensorflow.keras.preprocessing.text.Tokenizer() //Tokenizer is a class that converts text into integer sequences |
(32) | tokenizer.fit_on_texts(corpus) //build the word index from the corpus |
(33) | sequences = tokenizer.texts_to_sequences(corpus) //map each word to its integer index |
(34) | word_sequences = tensorflow.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = l) //pad or truncate every sequence to length l |
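For reference, the listing above maps onto plain Python roughly as follows. This is a minimal sketch rather than the exact implementation: the function name `build_word_sequences`, the user-dictionary path argument, and the padding length `MAX_LEN` (standing in for l) are assumptions, and the character-range check of Step 1 is expressed as a regular expression over U+4E00-U+9FA5.

```python
import re

import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 100  # assumed padding length, standing in for l in the listing


def build_word_sequences(texts, user_dict_path, stopwords, max_len=MAX_LEN):
    # Step 1: content cleaning - keep only Chinese characters (U+4E00..U+9FA5)
    cleaned = [re.sub(r"[^\u4e00-\u9fa5]", "", text.strip()) for text in texts]

    # Step 2: word splitting - load the user dictionary, segment with jieba,
    # and drop stop words
    jieba.load_userdict(user_dict_path)
    stopword_set = set(stopwords)
    corpus = [[word for word in jieba.lcut(text) if word not in stopword_set]
              for text in cleaned]

    # Step 3: tokenization - map words to integer ids and pad to a fixed length
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)  # build the word index from the corpus
    sequences = tokenizer.texts_to_sequences(corpus)
    word_sequences = pad_sequences(sequences, maxlen=max_len)
    return word_sequences, tokenizer
```

Returning the fitted tokenizer alongside `word_sequences` keeps the learned word index available for a downstream embedding layer, which is the usual consumer of this kind of preprocessing output.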