ajout fonction compteurs features

This commit is contained in:
François Pelletier 2019-10-31 01:32:02 -04:00
parent f7b94b575a
commit 834f38b151
2 changed files with 141 additions and 43 deletions

View file

@ -7,10 +7,12 @@ import sentiment_analysis_functions as sfun
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
train_pos_reviews_fn = "./data/train-positive-t1.txt"
train_neg_reviews_fn = "./data/train-negative-t1.txt"
@ -27,6 +29,7 @@ def load_reviews(filename):
reviews_list = json.load(fp)
return reviews_list
if __name__ == '__main__':
train_positive_reviews = load_reviews(train_pos_reviews_fn)
train_negative_reviews = load_reviews(train_neg_reviews_fn)
@ -34,13 +37,44 @@ if __name__ == '__main__':
test_negative_reviews = load_reviews(test_neg_reviews_fn)
print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))
x0 = sfun.tokenize(train_positive_reviews)
x1 = sfun.norm_stemming(x0)
x2 = sfun.norm_lemmatize(x0)
x11 = sfun.select_freq(x1)
x21 = sfun.select_freq(x2)
x12 = sfun.select_rem_stopwords(x1)
x22 = sfun.select_rem_stopwords(x2)
x22 = sfun.select_rem_stopwords(x2)
x13 = sfun.select_open_class(x1)
x23 = sfun.select_open_class(x2)
x11c = sfun.corpus_documents(x11)
x21c = sfun.corpus_documents(x21)
x12c = sfun.corpus_documents(x12)
x22c = sfun.corpus_documents(x22)
x13c = sfun.corpus_documents(x13)
x23c = sfun.corpus_documents(x23)
x111 = sfun.value_count(x11c)
x211 = sfun.value_count(x21c)
x121 = sfun.value_count(x12c)
x221 = sfun.value_count(x22c)
x131 = sfun.value_count(x13c)
x231 = sfun.value_count(x23c)
x112 = sfun.value_occurence(x11c)
x212 = sfun.value_occurence(x21c)
x122 = sfun.value_occurence(x12c)
x222 = sfun.value_occurence(x22c)
x132 = sfun.value_occurence(x13c)
x232 = sfun.value_occurence(x23c)
x113 = sfun.value_tfidf(x11c)
x213 = sfun.value_tfidf(x21c)
x123 = sfun.value_tfidf(x12c)
x223 = sfun.value_tfidf(x22c)
x133 = sfun.value_tfidf(x13c)
x233 = sfun.value_tfidf(x23c)

View file

@ -9,15 +9,17 @@ Created on Sun Oct 27 17:16:54 2019
import nltk
import re
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
# Normalisation
## Normalisation
def tokenize(reviews):
tokenizer = nltk.tokenize.ToktokTokenizer()
tokenized_reviews = []
for review in reviews:
# Plusieurs fin de phrases étaient représentées par deux espaces ou plus.
review = re.sub(r"\s{2,}",". ",review)
review = re.sub(r"\s{2,}", ". ", review)
review = str.lower(review)
tokenized_sentences = []
sentences = nltk.sent_tokenize(review)
@ -31,6 +33,7 @@ def tokenize(reviews):
tokenized_reviews.append(tokenized_sentences)
return tokenized_reviews
def norm_stemming(tokenized_reviews):
porter = nltk.PorterStemmer()
stemmed_reviews = []
@ -42,6 +45,7 @@ def norm_stemming(tokenized_reviews):
stemmed_reviews.append(stemmed_review)
return stemmed_reviews
def norm_lemmatize(tokenized_reviews):
wnl = nltk.WordNetLemmatizer()
normalized_reviews = []
@ -54,8 +58,9 @@ def norm_lemmatize(tokenized_reviews):
normalized_reviews.append(normalized_review)
return normalized_reviews
## Feature selection
# Feature selection
def dict_frequency(norm_reviews):
tokens_frequency = defaultdict(int)
for review in norm_reviews:
@ -64,54 +69,113 @@ def dict_frequency(norm_reviews):
tokens_frequency[token] += 1
return tokens_frequency
def select_freq(norm_reviews, threshold=3):
    """Keep only the tokens that are frequent enough in the corpus.

    Args:
        norm_reviews: Reviews as nested lists (review -> sentence -> token).
        threshold: A token is kept only when the count that
            ``dict_frequency`` reports for it is strictly greater than
            this value. Defaults to 3, the cut-off that used to be
            hard-coded.

    Returns:
        A new nested list with the same review/sentence layout as the
        input. A sentence that loses all of its tokens stays in place as
        an empty list.
    """
    tokens_frequency = dict_frequency(norm_reviews)
    return [
        [
            [token for token in sentence if tokens_frequency[token] > threshold]
            for sentence in review
        ]
        for review in norm_reviews
    ]
def select_rem_stopwords(norm_reviews):
    """Remove English stop words from every sentence of every review.

    Args:
        norm_reviews: Reviews as nested lists (review -> sentence -> token).

    Returns:
        A new nested list with the same review/sentence layout as the
        input, without the tokens found in NLTK's English stop-word list.
        A sentence made only of stop words stays in place as an empty list.
    """
    # Built once as a set so each membership test below is a hash lookup.
    # NOTE(review): presumably needs the NLTK 'stopwords' corpus to be
    # downloaded beforehand -- confirm in the target environment.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [
        [
            [token for token in sentence if token not in stop_words]
            for sentence in review
        ]
        for review in norm_reviews
    ]
def select_open_class(reviews):
    """Placeholder: ignore ``reviews`` and return 0.

    NOTE(review): a function with the same name is defined again further
    down in this file; if both definitions are kept, the later one is the
    one callers get.
    """
    return 0
## Attribute value
def pos_tag_reviews(norm_reviews):
    """Tag every review with the universal part-of-speech tagset.

    Each review is handed as a whole to ``nltk.pos_tag_sents``, so a
    review is expected to be a list of tokenised sentences.

    Returns:
        A list with one tagged review per input review, in input order.
    """
    return [
        nltk.pos_tag_sents(sentences, tagset='universal')
        for sentences in norm_reviews
    ]
def validate_open_class(token_tuple):
    """Tell whether a tagged token belongs to an open word class.

    Args:
        token_tuple: A pair whose index 0 is the token text and whose
            index 1 is its part-of-speech tag.

    Returns:
        True when the tag is NOUN, ADJ, ADV or VERB and the token is not
        'i'; False otherwise.
    """
    # The token 'i' is not tagged right, so it is rejected whatever its tag.
    return (
        token_tuple[1] in ('NOUN', 'ADJ', 'ADV', 'VERB')
        and token_tuple[0] != 'i'
    )
def select_open_class(norm_reviews):
    """Keep only the tokens accepted by ``validate_open_class``.

    The reviews are first tagged with ``pos_tag_reviews``; each tagged
    token is then kept or dropped, and only the token text (index 0 of
    the tagged pair) is written to the result.

    Returns:
        A nested list (review -> sentence -> token) with one entry per
        tagged review and per tagged sentence.
    """
    open_class_reviews = []
    for tagged_review in pos_tag_reviews(norm_reviews):
        kept_sentences = []
        for tagged_sentence in tagged_review:
            kept_sentences.append(
                [pair[0] for pair in tagged_sentence if validate_open_class(pair)]
            )
        open_class_reviews.append(kept_sentences)
    return open_class_reviews
# Attribute value
def dummy_function(x):
    """Identity function: hand back the argument untouched.

    In this file it is passed as both ``tokenizer`` and ``preprocessor``
    to the vectorizers.
    """
    return x
def corpus_documents(corpus):
    """Flatten each document's sentences into one flat list of tokens.

    Args:
        corpus: Nested lists (document -> sentence -> token).

    Returns:
        One list per document, in input order, holding that document's
        tokens in sentence order. A document without sentences yields an
        empty list.
    """
    flattened = []
    for document in corpus:
        flattened.append(
            [token for sentence in document for token in sentence]
        )
    return flattened
def value_count(reviews):
    """Vectorize the documents with raw token counts.

    Args:
        reviews: Iterable of documents.
            NOTE(review): presumably each document is already a flat list
            of tokens, as built by ``corpus_documents`` -- confirm against
            the caller.

    Returns:
        The document-term matrix returned by
        ``CountVectorizer.fit_transform``.
    """
    # dummy_function is plugged in as both preprocessor and tokenizer so
    # the vectorizer receives each document exactly as it was passed in.
    count_vectorizer = CountVectorizer(analyzer='word',
                                       tokenizer=dummy_function,
                                       preprocessor=dummy_function)
    return count_vectorizer.fit_transform(reviews)
def value_occurence(reviews):
    """Vectorize the documents with binary presence/absence values.

    Args:
        reviews: Iterable of documents.
            NOTE(review): presumably each document is already a flat list
            of tokens, as built by ``corpus_documents`` -- confirm against
            the caller.

    Returns:
        The document-term matrix returned by
        ``CountVectorizer.fit_transform`` with ``binary=True``.
    """
    # dummy_function is plugged in as both preprocessor and tokenizer so
    # the vectorizer receives each document exactly as it was passed in.
    binary_vectorizer = CountVectorizer(analyzer='word',
                                        tokenizer=dummy_function,
                                        preprocessor=dummy_function,
                                        binary=True)
    return binary_vectorizer.fit_transform(reviews)
def value_tfidf(reviews):
    """Vectorize the documents with TF-IDF weights.

    Args:
        reviews: Iterable of documents.
            NOTE(review): presumably each document is already a flat list
            of tokens, as built by ``corpus_documents`` -- confirm against
            the caller.

    Returns:
        The document-term matrix returned by
        ``TfidfVectorizer.fit_transform``.
    """
    # dummy_function is plugged in as both preprocessor and tokenizer so
    # the vectorizer receives each document exactly as it was passed in.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                       tokenizer=dummy_function,
                                       preprocessor=dummy_function)
    return tfidf_vectorizer.fit_transform(reviews)
# Other attributes
def attribute_polarity_count(reviews):
    """Placeholder: ignore ``reviews`` and return 0 (not implemented yet)."""
    return 0
def attribute_length(reviews):
    """Placeholder: ignore ``reviews`` and return 0 (not implemented yet)."""
    return 0
## Training
# Training
def train_naive_model(reviews):
    """Placeholder: ignore ``reviews`` and return 0 (not implemented yet)."""
    return 0
def train_regression_model(reviews):
    """Placeholder: ignore ``reviews`` and return 0 (not implemented yet)."""
    return 0