Reproduced from: https://www.iteye.com/blog/dengkane-2406703
Steps:
1. Labeled data. Positive comment text: pos_text.txt; negative comment text: neg_text.txt
2. Construct features: single words and two-word collocations (bigrams), e.g. "mobile phone is very", "very useful", "easy to use!" used as three classification features. By analogy, three-word and four-word collocations can also be used as features.
3. Feature dimensionality reduction: use statistical methods to find informative features, such as term frequency, document frequency, pointwise mutual information, information entropy, and the chi-square statistic.
4. Feature representation: NLTK's format, [{Feature 1: True, Feature 2: True, ..., Feature N: True}, class label] (see the short sketch after this list).
5. Build classifiers and predict: after choosing the best algorithm, the number of features can be tuned to test accuracy. (1) Train a classifier on the training set with the chosen classification algorithm. (2) Use the classifier to classify the development test set (dev-test set) and obtain the predicted labels. (3) Measure accuracy by comparing the classifier's predictions against the manually assigned labels.
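As a concrete sketch of the feature representation in step 4, each comment becomes a dictionary mapping features to True, paired with its class label. The words below are toy examples, not taken from the dataset:

# Toy illustration of the nltk feature format: [{feature: True, ...}, class label]
pos_example = [{'screen': True, 'easy to use': True}, 'pos']    # one positive comment
neg_example = [{'battery': True, 'out of power': True}, 'neg']  # one negative comment
train_set = [pos_example, neg_example]                          # list consumed by the classifiers below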
NLTK is mainly responsible for feature extraction (two-word and multi-word collocations require NLTK) and feature selection (using the statistical measures NLTK provides). scikit-learn is mainly responsible for the classification algorithms, evaluating classification performance, and the classification itself.
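As a rough, minimal sketch of this division of labor (the featuresets are toy data, and MultinomialNB is chosen arbitrarily), NLTK's SklearnClassifier wraps a scikit-learn estimator so that it can be trained directly on feature dictionaries:

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB

# Toy featuresets for illustration only
toy_train = [({'easy to use': True}, 'pos'), ({'out of power': True}, 'neg')]

clf = SklearnClassifier(MultinomialNB())   # scikit-learn supplies the algorithm
clf.train(toy_train)                       # nltk supplies the feature-dict interface
print(clf.classify_many([{'easy to use': True}, {'out of power': True}]))  # expected: ['pos', 'neg']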
Experiments:
1. Processing the data. text() concatenates all pos and neg comments and returns them as a single str.
def text():
    f1 = open('pos_text.txt', 'r')
    f2 = open('neg_text.txt', 'r')
    line1 = f1.readline()
    line2 = f2.readline()
    s = ''  # renamed from `str` to avoid shadowing the built-in
    while line1:
        s += line1
        line1 = f1.readline()
    while line2:
        s += line2
        line2 = f2.readline()
    f1.close()
    f2.close()
    return s
2. Building Features
# Use each single word as a feature
def bag_of_words(words):
    d = {}
    for word in words:
        d[word] = True
    return d

print(bag_of_words(text()[:5]))
{'except': True,'yes': True,'electricity': True,'pool': True,'no': True}
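Note that text() returns one long string, and iterating over a Python str yields individual characters, so bag_of_words(text()) actually builds single-character features for the Chinese text; that is why the output above shows single characters (e.g. the two characters of "battery") rather than whole words. A quick check:

# Iterating over a str gives characters, not words
sample = 'abcd'
print(list(sample))          # ['a', 'b', 'c', 'd']
print(bag_of_words(sample))  # {'a': True, 'b': True, 'c': True, 'd': True}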
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Use two-word collocations (bigrams) as features; keep the top n by chi-square score
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)  # build bigrams from the text
    bigrams = bigram_finder.nbest(score_fn, n)                 # select the top n bigrams by chi-square score
    newBigrams = [u + v for (u, v) in bigrams]                 # bigrams is a list of (word, word) pairs; join each pair
    return bag_of_words(newBigrams)                            # convert to a {feature: True} dictionary

print(bigram(text()[:5], score_fn=BigramAssocMeasures.chi_sq, n=1000))
{'Power Out': True,'Battery Not': True,'Battery': True,'Except': True}
# Use single words and two-word collocations together as features
def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    newBigrams = [u + v for (u, v) in bigrams]
    word_dict = bag_of_words(words)          # dictionary of single words
    bigrams_dict = bag_of_words(newBigrams)  # dictionary of two-word collocations
    word_dict.update(bigrams_dict)           # merge bigrams_dict into word_dict
    return word_dict

print(bigram_words(text()[:10], score_fn=BigramAssocMeasures.chi_sq, n=1000))
{'except': True,'yes': True,'electricity': True,'pool': True,'no': True,'give': True,'force': True,'all': True,'very': True,'all': True,'no': True,'Power': True,'Force': True,'Battery No': True,'Battery': True,'Force': True,'All': True,'Except': True}
import jieba  # use jieba (结巴) word segmentation results as features

def read_file(filename):
    # stop words
    stop = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    f = open(filename, 'r')
    line = f.readline()
    result = []  # renamed from `str` to avoid shadowing the built-in
    while line:
        s = line.split('\t')  # split the line; s[0] is the comment text
        # print('s:', s)       # ['......\n']
        # print('s[0]:', s[0]) # '......'
        fenci = jieba.cut(s[0], HMM=True)  # precise mode (cut_all=False) is the default; HMM=True enables new-word discovery
        result.append(list(set(fenci) - set(stop)))  # remove stop words, one word list per comment
        line = f.readline()
    return result  # a list of word lists, one for each comment
print(read_file('pos_text.txt')[:2])
['Really','Big Screen','Zombie','Good','Dare','Excellent','300','14','Buy More Blocks','Du','War','Dual Core','Secondary Kill','Help','Apple','Dot','One','G11','Resolution','Start',
'Local','Plant','Will','\n','Worth','16G','Please','Battery','No Force','Talent','4','2820','Take','Comprehensive','Cover','Sense','Answer','C6','Select','10','Message','Play',
'Big screen','Together','Play','Photo','3','Don't care','Brother','No','Very','Picture','Fruit','Game','Originally','Again','Expensive','Machine','Friends','Between','Decisive','Don't dare',
'G14','Just a moment','Consult','Poor','Decide','Feel','Small','Try','Office','Think','High','Multi-day','After Driving','Heart','Plan','No','Ultimate Car','Entangle','Buy','Play','Very Silent',
'No','Anhui','Fuyang','Wife','Very','Block','Card','Two','Almost','Price','Belt','500W'], ['9','Hope','Can','Battery','Very Beautiful','Very Great','Screen','Good','Inch',
'Almost','Finish','High','Cost Performance','4','Sense','Run','Du','Worth','Now','Hot Hand','16','2.3','One Point','4.3','Long Time','Bossy','Row','Software','Solve','Start',
'Very','True','Flaw does not hide yoga','Actually', '','Fluent','Compatible','Return','3.0','Problem','Real Machine','Whole','Clear','Machine None','\n']]
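For reference, a minimal illustration of how jieba segments a sentence (the sentence below is made up, and the exact segmentation depends on the jieba version and dictionary). jieba.cut returns a generator, so it is usually wrapped in list():

import jieba

# HMM=True (the default) lets jieba discover words that are not in its dictionary
sentence = '这个手机的电池很不给力'  # made-up example sentence
print(list(jieba.cut(sentence, HMM=True)))
# e.g. ['这个', '手机', '的', '电池', '很', '不给力']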
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

# Select the most informative features (the top `number` words) using the chi-square statistic
def jieba_feature(number):
    posWords = []
    negWords = []
    for items in read_file('pos_text.txt'):  # flatten the list of word lists into one word list
        for item in items:
            posWords.append(item)
    for items in read_file('neg_text.txt'):
        for item in items:
            negWords.append(item)

    word_fd = FreqDist()  # frequency of every word
    # FreqDist maps each word to its total number of occurrences. Its constructor accepts any list
    # and counts the duplicates in it; here we feed it the word list of the text.
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and the negative texts
    # A conditional frequency distribution is a collection of frequency distributions, one per condition;
    # the condition is usually the text category. It is built from (condition, event) pairs: here the
    # condition is the sentiment label and the event is a word.
    # Useful methods:
    #   conditions()                  - return the list of conditions
    #   tabulate(conditions, samples) - print a table for the given conditions and samples
    #   plot(conditions, samples)     - plot the distributions for the given conditions and samples
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of word tokens in the positive texts
    neg_word_count = cond_word_fd['neg'].N()  # number of word tokens in the negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}  # information score of every word
    for word, freq in word_fd.items():  # word_fd = {'word': count}
        # Chi-square score of the word for the positive class; other statistics such as pointwise
        # mutual information could be used here instead. The larger the chi-square value, the
        # stronger the association between the word and the class.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # a word's score is its positive plus its negative chi-square

    # Sort the words by score in descending order; `number` is the feature dimension and can be tuned
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return dict([(word, True) for word in best_words])
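To make the FreqDist / ConditionalFreqDist bookkeeping in jieba_feature() more concrete, here is a tiny hand-made example (toy words, not taken from the dataset):

from nltk.probability import FreqDist, ConditionalFreqDist

word_fd = FreqDist()                  # overall word frequencies
cond_word_fd = ConditionalFreqDist()  # word frequencies per class
for word in ['good', 'good', 'screen']:  # pretend these came from positive comments
    word_fd[word] += 1
    cond_word_fd['pos'][word] += 1
for word in ['bad', 'screen']:            # pretend these came from negative comments
    word_fd[word] += 1
    cond_word_fd['neg'][word] += 1

print(word_fd['screen'])          # 2 - total occurrences across both classes
print(cond_word_fd['pos'].N())    # 3 - number of word tokens under the 'pos' condition
print(cond_word_fd.conditions())  # ['pos', 'neg'] (order may vary)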
# Switch between the four feature-construction methods here and compare the results
def build_features():
    feature = bag_of_words(text())  # 1st method: single words
    # feature = bigram(text(), score_fn=BigramAssocMeasures.chi_sq, n=500)        # 2nd method: double words
    # feature = bigram_words(text(), score_fn=BigramAssocMeasures.chi_sq, n=500)  # 3rd method: single + double words
    # feature = jieba_feature(300)                                                # 4th method: jieba word segmentation
    posFeatures = []
    for items in read_file('pos_text.txt'):
        a = {}
        for item in items:  # items is the segmented word list of one comment
            if item in feature.keys():
                a[item] = 'True'
        posWords = [a, 'pos']  # label positive text with 'pos'
        posFeatures.append(posWords)
    negFeatures = []
    for items in read_file('neg_text.txt'):
        a = {}
        for item in items:
            if item in feature.keys():
                a[item] = 'True'
        negWords = [a, 'neg']  # label negative text with 'neg'
        negFeatures.append(negWords)
    return posFeatures, negFeatures
# Get the training data
posFeatures, negFeatures = build_features()

from random import shuffle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

shuffle(posFeatures)
shuffle(negFeatures)  # randomize the order of the texts

train = posFeatures[300:] + negFeatures[300:]  # training set (70%)
test = posFeatures[:300] + negFeatures[:300]   # test set (30%)
data, tag = zip(*test)  # separate the features and labels of the test set for evaluation

def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train)                # train the classifier
    pred = classifier.classify_many(data)  # predicted labels
    n = 0
    s = len(pred)
    for i in range(0, s):
        if pred[i] == tag[i]:
            n = n + 1
    return n / s  # classifier accuracy

print('BernoulliNB`s accuracy is %f' % score(BernoulliNB()))
print('MultinomialNB`s accuracy is %f' % score(MultinomialNB()))
print('LogisticRegression`s accuracy is %f' % score(LogisticRegression(solver='lbfgs')))
print('SVC`s accuracy is %f' % score(SVC(gamma='scale')))
print('LinearSVC`s accuracy is %f' % score(LinearSVC()))
# print('NuSVC`s accuracy is %f' % score(NuSVC()))
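The manual counting loop in score() is equivalent to sklearn's accuracy_score, which is already imported above; a minimal alternative sketch (assuming train, data, and tag are defined as above):

# Same accuracy computation, using sklearn.metrics.accuracy_score instead of a manual loop
def score_v2(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train)                # train the wrapped scikit-learn estimator
    pred = classifier.classify_many(data)  # predicted labels for the test set
    return accuracy_score(tag, pred)       # fraction of correct predictions

# print('BernoulliNB`s accuracy is %f' % score_v2(BernoulliNB()))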
3. Results
# Fourth kind: jieba word segmentation
# BernoulliNB`s accuracy is 0.858333
# **** MultinomialNB`s accuracy is 0.871667 ****
# LogisticRegression`s accuracy is 0.820000
# SVC`s accuracy is 0.805000
# LinearSVC`s accuracy is 0.795000

# Third kind: single words and double words
# **** BernoulliNB`s accuracy is 0.761667 ****
# MultinomialNB`s accuracy is 0.701667
# LogisticRegression`s accuracy is 0.756667
# SVC`s accuracy is 0.688333
# LinearSVC`s accuracy is 0.733333

# Second kind: double words
# **** BernoulliNB`s accuracy is 0.773333 ****
# MultinomialNB`s accuracy is 0.688333
# LogisticRegression`s accuracy is 0.726667
# SVC`s accuracy is 0.661667
# LinearSVC`s accuracy is 0.726667

# First kind: single words
# BernoulliNB`s accuracy is 0.641667
# MultinomialNB`s accuracy is 0.616667
# **** LogisticRegression`s accuracy is 0.668333 ****
# SVC`s accuracy is 0.545000
# LinearSVC`s accuracy is 0.653333