ajout fonctions formalisation et sélection de mots
This commit is contained in:
parent
67579c1f03
commit
f7b94b575a
2 changed files with 67 additions and 10 deletions
|
@ -3,7 +3,10 @@ import json
|
|||
import sentiment_analysis_functions as sfun
|
||||
|
||||
# installation
|
||||
# import nltk
|
||||
# nltk.download('punkt')
|
||||
# nltk.download('wordnet')
|
||||
# nltk.download('stopwords')
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
@ -32,6 +35,12 @@ if __name__ == '__main__':
|
|||
print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
|
||||
print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
|
||||
|
||||
xx = sfun.tokenize(train_positive_reviews)
|
||||
xx[0]
|
||||
x0 = sfun.tokenize(train_positive_reviews)
|
||||
x1 = sfun.norm_stemming(x0)
|
||||
x2 = sfun.norm_lemmatize(x0)
|
||||
|
||||
x11 = sfun.select_freq(x1)
|
||||
x21 = sfun.select_freq(x2)
|
||||
|
||||
x12 = sfun.select_rem_stopwords(x1)
|
||||
x22 = sfun.select_rem_stopwords(x2)
|
|
@ -8,35 +8,83 @@ Created on Sun Oct 27 17:16:54 2019
|
|||
|
||||
import nltk
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
## Normalisation
|
||||
|
||||
def tokenize(reviews):
    """Split raw reviews into lower-cased sentences of word tokens.

    Each review is normalised (runs of two or more whitespace characters
    become ". ", then the text is lower-cased), split into sentences with
    ``nltk.sent_tokenize`` and each sentence is tokenized with NLTK's
    ``ToktokTokenizer``. Stand-alone punctuation tokens are dropped.

    Args:
        reviews: iterable of review strings.

    Returns:
        A list with one entry per review; each entry is a list of
        sentences, and each sentence is a list of token strings.
    """
    # Exact single-character matches only. A substring test against
    # "?:!.,;" would also match "" and runs such as ".,".
    punctuation = frozenset("?:!.,;")
    tokenizer = nltk.tokenize.ToktokTokenizer()

    tokenized_reviews = []
    for review in reviews:
        # Many sentence endings in the data are marked only by two or more
        # spaces, so turn them into an explicit full stop.
        review = re.sub(r"\s{2,}", ". ", review).lower()

        tokenized_sentences = []
        for sentence in nltk.sent_tokenize(review):
            # Build a new list instead of calling remove() while iterating:
            # removing in place shifts the remaining items and makes the
            # loop skip the token that follows each removed one.
            sentence_tokens = [
                token
                for token in tokenizer.tokenize(sentence)
                if token not in punctuation
            ]
            tokenized_sentences.append(sentence_tokens)
        tokenized_reviews.append(tokenized_sentences)
    return tokenized_reviews
|
||||
|
||||
def norm_stemming(tokenized_reviews):
    """Stem every token of every sentence with NLTK's Porter stemmer.

    Args:
        tokenized_reviews: reviews as nested lists
            (review -> sentence -> token string).

    Returns:
        A new nested list with the same review/sentence structure in which
        each token is replaced by ``PorterStemmer.stem(token)``.
    """
    # The former placeholder ``return 0`` at the top of the function made
    # the implementation below unreachable; it has been removed.
    porter = nltk.PorterStemmer()
    return [
        [[porter.stem(token) for token in sentence] for sentence in review]
        for review in tokenized_reviews
    ]
|
||||
|
||||
def norm_lemmatize(tokenized_reviews):
    """Lemmatize every token of every sentence with NLTK's WordNet lemmatizer.

    Args:
        tokenized_reviews: reviews as nested lists
            (review -> sentence -> token string).

    Returns:
        A new nested list with the same review/sentence structure in which
        each token is replaced by its lemma.
    """
    # The former placeholder ``return 0`` at the top of the function made
    # the implementation below unreachable; it has been removed.
    wnl = nltk.WordNetLemmatizer()
    return [
        [
            # Lemmatize as a verb (pos="v") rather than the default noun.
            [wnl.lemmatize(token, pos="v") for token in sentence]
            for sentence in review
        ]
        for review in tokenized_reviews
    ]
|
||||
|
||||
## Feature selection
|
||||
|
||||
def dict_frequency(norm_reviews):
    """Count how many times each token occurs across all reviews.

    Args:
        norm_reviews: reviews as nested lists
            (review -> sentence -> token).

    Returns:
        A ``defaultdict(int)`` mapping each token to its total number of
        occurrences in ``norm_reviews``.
    """
    tokens_frequency = defaultdict(int)
    for review in norm_reviews:
        for sentence in review:
            for token in sentence:
                tokens_frequency[token] += 1
    return tokens_frequency


def select_freq(norm_reviews, threshold=3):
    """Keep only the tokens that occur more than ``threshold`` times overall.

    Frequencies are counted over the whole of ``norm_reviews`` with
    ``dict_frequency``. The review/sentence structure is preserved, so a
    sentence whose tokens are all too rare becomes an empty list.

    Args:
        norm_reviews: reviews as nested lists
            (review -> sentence -> token). It is traversed twice, so it
            must be re-iterable (e.g. a list, not a generator).
        threshold: a token is kept only if its total count is strictly
            greater than this value. Defaults to 3.

    Returns:
        A new nested list containing only the sufficiently frequent tokens.
    """
    tokens_frequency = dict_frequency(norm_reviews)
    return [
        [
            [token for token in sentence if tokens_frequency[token] > threshold]
            for sentence in review
        ]
        for review in norm_reviews
    ]
|
||||
|
||||
def select_rem_stopwords(norm_reviews):
    """Remove English stop words from every sentence of every review.

    The stop-word list is NLTK's ``stopwords`` corpus for 'english'.
    The review/sentence nesting of the input is preserved.

    Args:
        norm_reviews: reviews as nested lists
            (review -> sentence -> token).

    Returns:
        A new nested list without the tokens found in the stop-word list.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))

    filtered_reviews = []
    for review in norm_reviews:
        filtered_review = []
        for sentence in review:
            kept_tokens = []
            for token in sentence:
                if token in stop_words:
                    continue
                kept_tokens.append(token)
            filtered_review.append(kept_tokens)
        filtered_reviews.append(filtered_review)
    return filtered_reviews
|
||||
|
||||
def select_open_class(reviews):
    """Placeholder for open-class word selection (not implemented).

    The argument is currently ignored.

    Args:
        reviews: unused.

    Returns:
        Always the integer 0.
    """
    # TODO: implement the selection; until then callers receive 0.
    return 0
|
||||
|
|
Loading…
Reference in a new issue