ajout fonctions normalisation et sélection de mots

This commit is contained in:
François Pelletier 2019-10-28 23:34:06 -04:00
parent 67579c1f03
commit f7b94b575a
2 changed files with 67 additions and 10 deletions

View file

@ -3,7 +3,10 @@ import json
import sentiment_analysis_functions as sfun
# installation
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
@ -32,6 +35,12 @@ if __name__ == '__main__':
print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
# Tokenize once and reuse the result for both normalization strategies.
# (The previous duplicate `xx = sfun.tokenize(...)` and the no-op
# expression `xx[0]` tokenized the corpus twice for nothing; removed.)
x0 = sfun.tokenize(train_positive_reviews)
x1 = sfun.norm_stemming(x0)
x2 = sfun.norm_lemmatize(x0)
# Feature selection applied to each normalized corpus:
# frequency threshold (x11/x21) and stopword removal (x12/x22).
x11 = sfun.select_freq(x1)
x21 = sfun.select_freq(x2)
x12 = sfun.select_rem_stopwords(x1)
x22 = sfun.select_rem_stopwords(x2)

View file

@ -8,35 +8,83 @@ Created on Sun Oct 27 17:16:54 2019
import nltk
import re
from collections import defaultdict
## Normalisation
def tokenize(reviews):
    """Tokenize raw review strings into lists of sentences of word tokens.

    Each review is lowercased, split into sentences, then each sentence
    is tokenized with NLTK's ToktokTokenizer and punctuation tokens are
    dropped.

    Parameters:
        reviews: iterable of review strings.

    Returns:
        list of reviews, each a list of sentences, each a list of tokens.
    """
    tokenizer = nltk.tokenize.ToktokTokenizer()
    tokenized_reviews = []
    for review in reviews:
        # Many sentence endings were represented by two or more spaces;
        # turn those runs into an explicit sentence separator.
        review = re.sub(r"\s{2,}", ". ", review)
        review = str.lower(review)
        tokenized_sentences = []
        sentences = nltk.sent_tokenize(review)
        for sentence in sentences:
            sentence_tokens = tokenizer.tokenize(sentence)
            # Remove punctuation tokens. Filtering into a new list fixes
            # the original bug of calling list.remove() while iterating
            # the same list, which skipped consecutive punctuation tokens.
            sentence_tokens = [t for t in sentence_tokens if t not in "?:!.,;"]
            tokenized_sentences.append(sentence_tokens)
        tokenized_reviews.append(tokenized_sentences)
    return tokenized_reviews
def norm_stemming(tokenized_reviews):
    """Stem every token with NLTK's Porter stemmer.

    The original body started with a leftover `return 0`, which made the
    real implementation unreachable; that dead line is removed.

    Parameters:
        tokenized_reviews: list of reviews, each a list of sentences,
            each a list of token strings (output of tokenize()).

    Returns:
        The same nested structure with each token replaced by its stem.
    """
    porter = nltk.PorterStemmer()
    stemmed_reviews = []
    for review in tokenized_reviews:
        stemmed_review = []
        for sentence in review:
            stemmed_tokens = [porter.stem(token) for token in sentence]
            stemmed_review.append(stemmed_tokens)
        stemmed_reviews.append(stemmed_review)
    return stemmed_reviews
def norm_lemmatize(tokenized_reviews):
    """Lemmatize every token with NLTK's WordNet lemmatizer.

    The original body started with a leftover `return 0`, which made the
    real implementation unreachable; that dead line is removed.

    Parameters:
        tokenized_reviews: list of reviews, each a list of sentences,
            each a list of token strings (output of tokenize()).

    Returns:
        The same nested structure with each token replaced by its lemma.
    """
    wnl = nltk.WordNetLemmatizer()
    normalized_reviews = []
    for review in tokenized_reviews:
        normalized_review = []
        for sentence in review:
            # Lemmatize as a verb rather than the default noun.
            normalized_tokens = [wnl.lemmatize(token, pos="v") for token in sentence]
            normalized_review.append(normalized_tokens)
        normalized_reviews.append(normalized_review)
    return normalized_reviews
## Feature selection
def select_freq(reviews):
    # NOTE(review): placeholder stub that is shadowed by the full
    # select_freq defined later in this file — at import time the later
    # definition wins, so this body is never called. Safe to delete.
    return 0
def select_rem_stopwords(reviews):
    # NOTE(review): placeholder stub that is shadowed by the full
    # select_rem_stopwords defined later in this file — at import time
    # the later definition wins, so this body is never called.
    return 0
def dict_frequency(norm_reviews):
    """Count how many times each token occurs across the whole corpus.

    Parameters:
        norm_reviews: list of reviews, each a list of sentences, each a
            list of token strings.

    Returns:
        defaultdict(int) mapping token -> total occurrence count.
    """
    frequencies = defaultdict(int)
    # Flatten reviews -> sentences -> tokens and tally each token.
    all_tokens = (
        token
        for review in norm_reviews
        for sentence in review
        for token in sentence
    )
    for token in all_tokens:
        frequencies[token] += 1
    return frequencies
def select_freq(norm_reviews, min_freq=3):
    """Keep only tokens whose corpus-wide frequency exceeds a threshold.

    Generalized from a hard-coded threshold of 3 to a keyword parameter;
    the default preserves the original behavior.

    Parameters:
        norm_reviews: list of reviews, each a list of sentences, each a
            list of token strings.
        min_freq: tokens are kept only when their total corpus count is
            strictly greater than this value (default 3).

    Returns:
        The same nested structure with infrequent tokens removed.
    """
    tokens_frequency = dict_frequency(norm_reviews)
    filtered_reviews = [
        [
            [token for token in sentence if tokens_frequency[token] > min_freq]
            for sentence in review
        ]
        for review in norm_reviews
    ]
    return filtered_reviews
def select_rem_stopwords(norm_reviews):
    """Remove English stopwords from every sentence of every review.

    Parameters:
        norm_reviews: list of reviews, each a list of sentences, each a
            list of token strings.

    Returns:
        The same nested structure with stopword tokens removed.
    """
    # Build the stopword set once; set membership keeps each test O(1).
    sws = set(nltk.corpus.stopwords.words('english'))
    filtered_reviews = []
    for review in norm_reviews:
        filtered_review = []
        for sentence in review:
            kept_tokens = [token for token in sentence if token not in sws]
            filtered_review.append(kept_tokens)
        filtered_reviews.append(filtered_review)
    return filtered_reviews
def select_open_class(reviews):
    """Select open-class words from reviews.

    TODO: not implemented yet — currently returns the placeholder 0.
    """
    return 0