diff --git a/sentiment_analysis.py b/sentiment_analysis.py
index 9d114d6..19b6e0d 100644
--- a/sentiment_analysis.py
+++ b/sentiment_analysis.py
@@ -3,7 +3,10 @@ import json
 import sentiment_analysis_functions as sfun
 
 # installation
+# import nltk
 # nltk.download('punkt')
+# nltk.download('wordnet')
+# nltk.download('stopwords')
 
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.naive_bayes import MultinomialNB
@@ -32,6 +35,12 @@ if __name__ == '__main__':
     print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
     print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
 
-    xx = sfun.tokenize(train_positive_reviews)
-    xx[0]
+    x0 = sfun.tokenize(train_positive_reviews)
+    x1 = sfun.norm_stemming(x0)
+    x2 = sfun.norm_lemmatize(x0)
+
+    x11 = sfun.select_freq(x1)
+    x21 = sfun.select_freq(x2)
+    x12 = sfun.select_rem_stopwords(x1)
+    x22 = sfun.select_rem_stopwords(x2)
\ No newline at end of file
diff --git a/sentiment_analysis_functions.py b/sentiment_analysis_functions.py
index dd3ac92..979e85b 100644
--- a/sentiment_analysis_functions.py
+++ b/sentiment_analysis_functions.py
@@ -8,35 +8,81 @@ Created on Sun Oct 27 17:16:54 2019
 
 import nltk
 import re
+from collections import defaultdict
 
 ## Normalisation
 
 def tokenize(reviews):
+    tokenizer = nltk.tokenize.ToktokTokenizer()
     tokenized_reviews = []
     for review in reviews:
         # Several sentence endings were represented by two or more spaces.
         review = re.sub(r"\s{2,}",". ",review)
+        review = review.lower()
         tokenized_sentences = []
         sentences = nltk.sent_tokenize(review)
         for sentence in sentences:
-            sentence_tokens = nltk.word_tokenize(sentence)
+            sentence_tokens = tokenizer.tokenize(sentence)
+            # Drop punctuation tokens; filter into a new list rather than
+            # calling remove() while iterating, which skips elements.
+            sentence_tokens = [t for t in sentence_tokens if t not in "?:!.,;"]
             tokenized_sentences.append(sentence_tokens)
         tokenized_reviews.append(tokenized_sentences)
     return tokenized_reviews
 
 def norm_stemming(tokenized_reviews):
-    return 0
+    porter = nltk.PorterStemmer()
+    stemmed_reviews = []
+    for review in tokenized_reviews:
+        stemmed_review = []
+        for sentence in review:
+            stemmed_tokens = [porter.stem(token) for token in sentence]
+            stemmed_review.append(stemmed_tokens)
+        stemmed_reviews.append(stemmed_review)
+    return stemmed_reviews
 
 def norm_lemmatize(tokenized_reviews):
-    return 0
+    wnl = nltk.WordNetLemmatizer()
+    normalized_reviews = []
+    for review in tokenized_reviews:
+        normalized_review = []
+        for sentence in review:
+            # Lemmatize to the verb form rather than the noun
+            normalized_tokens = [wnl.lemmatize(token, pos="v") for token in sentence]
+            normalized_review.append(normalized_tokens)
+        normalized_reviews.append(normalized_review)
+    return normalized_reviews
 
 ## Feature selection
-
-def select_freq(reviews):
-    return 0
-def select_rem_stopwords(reviews):
-    return 0
+def dict_frequency(norm_reviews):
+    tokens_frequency = defaultdict(int)
+    for review in norm_reviews:
+        for sentence in review:
+            for token in sentence:
+                tokens_frequency[token] += 1
+    return tokens_frequency
+
+def select_freq(norm_reviews):
+    # Keep only tokens that occur more than 3 times in the whole corpus
+    tokens_frequency = dict_frequency(norm_reviews)
+    norm_reviews_freq3 = [
+        [
+            [token for token in sentence if tokens_frequency[token] > 3]
+            for sentence in review
+        ]
+        for review in norm_reviews
+    ]
+    return norm_reviews_freq3
+
+def select_rem_stopwords(norm_reviews):
+    sws = set(nltk.corpus.stopwords.words('english'))
+    norm_reviews_stoprem = [
+        [
+            [token for token in sentence if token not in sws]
+            for sentence in review
+        ]
+        for review in norm_reviews
+    ]
+    return norm_reviews_stoprem
 
 def select_open_class(reviews):
     return 0
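
For quick verification, here is a minimal end-to-end sketch of how the new pipeline chains together. The toy `reviews` list is illustrative only, not part of the patch; it assumes both modules are on the import path and that the punkt, wordnet, and stopwords corpora have been downloaded as in the commented installation block.

    import sentiment_analysis_functions as sfun

    # Hypothetical sample input; the real script loads positive/negative
    # review sets in sentiment_analysis.py.
    reviews = [
        "Great movie.  The acting was wonderful!",
        "Dull plot.  I would not recommend it.",
    ]

    x0 = sfun.tokenize(reviews)          # reviews -> sentences -> lowercased tokens
    x1 = sfun.norm_stemming(x0)          # Porter-stemmed tokens
    x2 = sfun.norm_lemmatize(x0)         # verb-lemmatized tokens
    x12 = sfun.select_rem_stopwords(x1)  # drop English stop words

    print(x12[0])

Note that select_freq keeps only tokens seen more than 3 times across the whole corpus, so on a sample this small it would filter out nearly everything; it is intended for the full training sets loaded in sentiment_analysis.py.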