From 834f38b15157cd42c1e154bb16fc5bf37c3adba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Pelletier?= Date: Thu, 31 Oct 2019 01:32:02 -0400 Subject: [PATCH] ajout fonction compteurs features --- sentiment_analysis.py | 46 +++++++++-- sentiment_analysis_functions.py | 138 +++++++++++++++++++++++--------- 2 files changed, 141 insertions(+), 43 deletions(-) diff --git a/sentiment_analysis.py b/sentiment_analysis.py index 19b6e0d..36e99dd 100644 --- a/sentiment_analysis.py +++ b/sentiment_analysis.py @@ -7,10 +7,12 @@ import sentiment_analysis_functions as sfun # nltk.download('punkt') # nltk.download('wordnet') # nltk.download('stopwords') +# nltk.download('averaged_perceptron_tagger') +# nltk.download('universal_tagset') -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression +# from sklearn.feature_extraction.text import CountVectorizer +# from sklearn.naive_bayes import MultinomialNB +# from sklearn.linear_model import LogisticRegression train_pos_reviews_fn = "./data/train-positive-t1.txt" train_neg_reviews_fn = "./data/train-negative-t1.txt" @@ -27,6 +29,7 @@ def load_reviews(filename): reviews_list = json.load(fp) return reviews_list + if __name__ == '__main__': train_positive_reviews = load_reviews(train_pos_reviews_fn) train_negative_reviews = load_reviews(train_neg_reviews_fn) @@ -34,13 +37,44 @@ if __name__ == '__main__': test_negative_reviews = load_reviews(test_neg_reviews_fn) print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews)) print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews)) - + x0 = sfun.tokenize(train_positive_reviews) x1 = sfun.norm_stemming(x0) x2 = sfun.norm_lemmatize(x0) - + x11 = sfun.select_freq(x1) x21 = sfun.select_freq(x2) x12 = sfun.select_rem_stopwords(x1) - x22 = sfun.select_rem_stopwords(x2) \ No newline at end of file + x22 = sfun.select_rem_stopwords(x2) + + x13 = sfun.select_open_class(x1) + x23 = sfun.select_open_class(x2) + + x11c = sfun.corpus_documents(x11) + x21c = sfun.corpus_documents(x21) + x12c = sfun.corpus_documents(x12) + x22c = sfun.corpus_documents(x22) + x13c = sfun.corpus_documents(x13) + x23c = sfun.corpus_documents(x23) + + x111 = sfun.value_count(x11c) + x211 = sfun.value_count(x21c) + x121 = sfun.value_count(x12c) + x221 = sfun.value_count(x22c) + x131 = sfun.value_count(x13c) + x231 = sfun.value_count(x23c) + + x112 = sfun.value_occurence(x11c) + x212 = sfun.value_occurence(x21c) + x122 = sfun.value_occurence(x12c) + x222 = sfun.value_occurence(x22c) + x132 = sfun.value_occurence(x13c) + x232 = sfun.value_occurence(x23c) + + x113 = sfun.value_tfidf(x11c) + x213 = sfun.value_tfidf(x21c) + x123 = sfun.value_tfidf(x12c) + x223 = sfun.value_tfidf(x22c) + x133 = sfun.value_tfidf(x13c) + x233 = sfun.value_tfidf(x23c) \ No newline at end of file diff --git a/sentiment_analysis_functions.py b/sentiment_analysis_functions.py index 979e85b..4e926f4 100644 --- a/sentiment_analysis_functions.py +++ b/sentiment_analysis_functions.py @@ -9,15 +9,17 @@ Created on Sun Oct 27 17:16:54 2019 import nltk import re from collections import defaultdict +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer + + +# Normalisation -## Normalisation - def tokenize(reviews): tokenizer = nltk.tokenize.ToktokTokenizer() tokenized_reviews = [] for review in reviews: # Plusieurs fin de phrases étaient représentées par deux espaces ou plus. - review = re.sub(r"\s{2,}",". ",review) + review = re.sub(r"\s{2,}", ". ", review) review = str.lower(review) tokenized_sentences = [] sentences = nltk.sent_tokenize(review) @@ -31,6 +33,7 @@ def tokenize(reviews): tokenized_reviews.append(tokenized_sentences) return tokenized_reviews + def norm_stemming(tokenized_reviews): porter = nltk.PorterStemmer() stemmed_reviews = [] @@ -42,6 +45,7 @@ def norm_stemming(tokenized_reviews): stemmed_reviews.append(stemmed_review) return stemmed_reviews + def norm_lemmatize(tokenized_reviews): wnl = nltk.WordNetLemmatizer() normalized_reviews = [] @@ -54,8 +58,9 @@ def norm_lemmatize(tokenized_reviews): normalized_reviews.append(normalized_review) return normalized_reviews -## Feature selection - + +# Feature selection + def dict_frequency(norm_reviews): tokens_frequency = defaultdict(int) for review in norm_reviews: @@ -64,54 +69,113 @@ def dict_frequency(norm_reviews): tokens_frequency[token] += 1 return tokens_frequency + def select_freq(norm_reviews): tokens_frequency = dict_frequency(norm_reviews) norm_reviews_freq3 = [ - [ - [token for token in sentence if tokens_frequency[token] > 3] - for sentence in review - ] - for review in norm_reviews - ] + [ + [token for token in sentence if tokens_frequency[token] > 3] + for sentence in review + ] + for review in norm_reviews + ] return norm_reviews_freq3 - + + def select_rem_stopwords(norm_reviews): sws = set(nltk.corpus.stopwords.words('english')) norm_reviews_stoprem = [ - [ - [token for token in sentence if token not in sws] - for sentence in review - ] - for review in norm_reviews - ] + [ + [token for token in sentence if token not in sws] + for sentence in review + ] + for review in norm_reviews + ] return norm_reviews_stoprem - -def select_open_class(reviews): - return 0 - -## Attribute value - + + +def pos_tag_reviews(norm_reviews): + tagged_reviews = [] + for review in norm_reviews: + tagged_review = nltk.pos_tag_sents(review, tagset='universal') + tagged_reviews.append(tagged_review) + return tagged_reviews + + +def validate_open_class(token_tuple): + # i is not tagged right + if token_tuple[1] in ['NOUN','ADJ','ADV','VERB'] and token_tuple[0] != 'i': + return True + else: + return False + + +def select_open_class(norm_reviews): + tagged_reviews = pos_tag_reviews(norm_reviews) + select_oc_token = [ + [ + [token_tuple[0] for token_tuple in sentence if validate_open_class(token_tuple)] + for sentence in review + ] + for review in tagged_reviews + ] + return select_oc_token + + +# Attribute value + +def dummy_function(x): + return x + +def corpus_documents(corpus): + corpus_documents = [] + for document in corpus: + doc_sentences = [] + for sentence in document: + doc_sentences += sentence + corpus_documents.append(doc_sentences) + return corpus_documents + def value_count(reviews): - return 0 - + cvr = CountVectorizer(analyzer='word', + tokenizer=dummy_function, + preprocessor=dummy_function) + feat_value_count = cvr.fit_transform(reviews) + return feat_value_count + + def value_occurence(reviews): - return 0 - + hvr = CountVectorizer(analyzer='word', + tokenizer=dummy_function, + preprocessor=dummy_function, + binary=True) + feat_value_occurence = hvr.fit_transform(reviews) + return feat_value_occurence + + def value_tfidf(reviews): - return 0 - -## Other attributes - + tdvr = TfidfVectorizer(analyzer='word', + tokenizer=dummy_function, + preprocessor=dummy_function) + feat_value_tfidf = tdvr.fit_transform(reviews) + return feat_value_tfidf + + +# Other attributes + def attribute_polarity_count(reviews): return 0 - + + def attribute_length(reviews): return 0 - -## Training - + + +# Training + def train_naive_model(reviews): return 0 - + + def train_regression_model(reviews): return 0