Add feature counter functions

parent f7b94b575a
commit 834f38b151

2 changed files with 141 additions and 43 deletions
@@ -7,10 +7,12 @@ import sentiment_analysis_functions as sfun
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression

train_pos_reviews_fn = "./data/train-positive-t1.txt"
train_neg_reviews_fn = "./data/train-negative-t1.txt"
@@ -27,6 +29,7 @@ def load_reviews(filename):
        reviews_list = json.load(fp)
    return reviews_list


if __name__ == '__main__':
    train_positive_reviews = load_reviews(train_pos_reviews_fn)
    train_negative_reviews = load_reviews(train_neg_reviews_fn)
@@ -34,13 +37,44 @@ if __name__ == '__main__':
    test_negative_reviews = load_reviews(test_neg_reviews_fn)
    print("Nb of training reviews - positive:", len(train_positive_reviews), "negative:", len(train_negative_reviews))
    print("Nb of test reviews - positive:", len(test_positive_reviews), "negative:", len(test_negative_reviews))


    x0 = sfun.tokenize(train_positive_reviews)
    x1 = sfun.norm_stemming(x0)
    x2 = sfun.norm_lemmatize(x0)


    x11 = sfun.select_freq(x1)
    x21 = sfun.select_freq(x2)

    x12 = sfun.select_rem_stopwords(x1)
    x22 = sfun.select_rem_stopwords(x2)

    x13 = sfun.select_open_class(x1)
    x23 = sfun.select_open_class(x2)

    x11c = sfun.corpus_documents(x11)
    x21c = sfun.corpus_documents(x21)
    x12c = sfun.corpus_documents(x12)
    x22c = sfun.corpus_documents(x22)
    x13c = sfun.corpus_documents(x13)
    x23c = sfun.corpus_documents(x23)

    x111 = sfun.value_count(x11c)
    x211 = sfun.value_count(x21c)
    x121 = sfun.value_count(x12c)
    x221 = sfun.value_count(x22c)
    x131 = sfun.value_count(x13c)
    x231 = sfun.value_count(x23c)

    x112 = sfun.value_occurence(x11c)
    x212 = sfun.value_occurence(x21c)
    x122 = sfun.value_occurence(x12c)
    x222 = sfun.value_occurence(x22c)
    x132 = sfun.value_occurence(x13c)
    x232 = sfun.value_occurence(x23c)

    x113 = sfun.value_tfidf(x11c)
    x213 = sfun.value_tfidf(x21c)
    x123 = sfun.value_tfidf(x12c)
    x223 = sfun.value_tfidf(x22c)
    x133 = sfun.value_tfidf(x13c)
    x233 = sfun.value_tfidf(x23c)
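The block above builds every combination of normalization (first digit: 1 = stemming, 2 = lemmatization), feature selection (second digit: 1 = frequency threshold, 2 = stopword removal, 3 = open-class words) and attribute value (third digit: 1 = counts, 2 = binary occurrence, 3 = TF-IDF). A minimal sketch of one path through that grid, assuming sentiment_analysis_functions is importable as sfun and the NLTK resources listed at the top of the script are downloaded; the two reviews are invented and this code is not part of the commit:

import sentiment_analysis_functions as sfun

# Two made-up reviews; double spaces stand in for sentence breaks, as in the data files.
reviews = ["great movie  loved the acting", "terrible plot  would not watch again"]

tokens = sfun.tokenize(reviews)               # reviews -> sentences -> tokens
stemmed = sfun.norm_stemming(tokens)          # Porter-stemmed tokens
no_stop = sfun.select_rem_stopwords(stemmed)  # drop English stopwords
documents = sfun.corpus_documents(no_stop)    # flatten each review into one token list
tfidf = sfun.value_tfidf(documents)           # sparse TF-IDF matrix (documents x vocabulary)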
@@ -9,15 +9,17 @@ Created on Sun Oct 27 17:16:54 2019
import nltk
import re
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer


# Normalisation

## Normalisation

def tokenize(reviews):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    tokenized_reviews = []
    for review in reviews:
        # Several sentence endings were represented by two or more spaces.
        review = re.sub(r"\s{2,}",". ",review)
        review = re.sub(r"\s{2,}", ". ", review)
        review = str.lower(review)
        tokenized_sentences = []
        sentences = nltk.sent_tokenize(review)
@@ -31,6 +33,7 @@ def tokenize(reviews):
        tokenized_reviews.append(tokenized_sentences)
    return tokenized_reviews


def norm_stemming(tokenized_reviews):
    porter = nltk.PorterStemmer()
    stemmed_reviews = []
@@ -42,6 +45,7 @@ def norm_stemming(tokenized_reviews):
        stemmed_reviews.append(stemmed_review)
    return stemmed_reviews


def norm_lemmatize(tokenized_reviews):
    wnl = nltk.WordNetLemmatizer()
    normalized_reviews = []
@@ -54,8 +58,9 @@ def norm_lemmatize(tokenized_reviews):
        normalized_reviews.append(normalized_review)
    return normalized_reviews

## Feature selection


# Feature selection

def dict_frequency(norm_reviews):
    tokens_frequency = defaultdict(int)
    for review in norm_reviews:
@@ -64,54 +69,113 @@ def dict_frequency(norm_reviews):
                tokens_frequency[token] += 1
    return tokens_frequency


def select_freq(norm_reviews):
    tokens_frequency = dict_frequency(norm_reviews)
    norm_reviews_freq3 = [
        [
            [token for token in sentence if tokens_frequency[token] > 3]
            for sentence in review
        ]
        for review in norm_reviews
    ]
    return norm_reviews_freq3

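dict_frequency accumulates corpus-wide token counts and select_freq keeps only tokens seen more than three times across the whole corpus. A small illustrative run, not part of the commit, assuming the two functions above are in scope; the mini-corpus is invented:

# Hypothetical mini-corpus: 2 reviews, each a list of sentences of tokens.
corpus = [
    [["good", "good", "plot"], ["good", "actor"]],
    [["good", "good", "plot"], ["bad", "actor"]],
]
freq = dict_frequency(corpus)   # {"good": 5, "plot": 2, "actor": 2, "bad": 1}
kept = select_freq(corpus)      # only "good" clears the > 3 threshold
# kept == [[["good", "good"], ["good"]], [["good", "good"], []]]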
def select_rem_stopwords(norm_reviews):
    sws = set(nltk.corpus.stopwords.words('english'))
    norm_reviews_stoprem = [
        [
            [token for token in sentence if token not in sws]
            for sentence in review
        ]
        for review in norm_reviews
    ]
    return norm_reviews_stoprem

def select_open_class(reviews):
    return 0

## Attribute value


def pos_tag_reviews(norm_reviews):
    tagged_reviews = []
    for review in norm_reviews:
        tagged_review = nltk.pos_tag_sents(review, tagset='universal')
        tagged_reviews.append(tagged_review)
    return tagged_reviews


def validate_open_class(token_tuple):
    # i is not tagged right
    if token_tuple[1] in ['NOUN','ADJ','ADV','VERB'] and token_tuple[0] != 'i':
        return True
    else:
        return False


def select_open_class(norm_reviews):
    tagged_reviews = pos_tag_reviews(norm_reviews)
    select_oc_token = [
        [
            [token_tuple[0] for token_tuple in sentence if validate_open_class(token_tuple)]
            for sentence in review
        ]
        for review in tagged_reviews
    ]
    return select_oc_token

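The new select_open_class replaces the earlier `return 0` stub: reviews are tagged with the universal tagset via pos_tag_reviews, and validate_open_class keeps only nouns, adjectives, adverbs and verbs, dropping the token 'i' because the tagger handles it poorly. A rough usage sketch, not part of the commit, with an invented input and the tags the NLTK tagger would most likely assign:

# Hypothetical pre-tokenized input: 1 review with 1 sentence.
reviews = [[["this", "movie", "is", "really", "good"]]]
open_class = select_open_class(reviews)
# Likely universal tags: this/DET movie/NOUN is/VERB really/ADV good/ADJ
# -> open_class == [[["movie", "is", "really", "good"]]]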
# Attribute value

def dummy_function(x):
    return x

def corpus_documents(corpus):
    corpus_documents = []
    for document in corpus:
        doc_sentences = []
        for sentence in document:
            doc_sentences += sentence
        corpus_documents.append(doc_sentences)
    return corpus_documents

def value_count(reviews):
    return 0

    cvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function)
    feat_value_count = cvr.fit_transform(reviews)
    return feat_value_count


def value_occurence(reviews):
    return 0

    hvr = CountVectorizer(analyzer='word',
                          tokenizer=dummy_function,
                          preprocessor=dummy_function,
                          binary=True)
    feat_value_occurence = hvr.fit_transform(reviews)
    return feat_value_occurence


def value_tfidf(reviews):
    return 0

## Other attributes

    tdvr = TfidfVectorizer(analyzer='word',
                           tokenizer=dummy_function,
                           preprocessor=dummy_function)
    feat_value_tfidf = tdvr.fit_transform(reviews)
    return feat_value_tfidf

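value_count, value_occurence and value_tfidf replace what were previously bare `return 0` placeholders with scikit-learn vectorizers. Because the documents arrive already tokenized, dummy_function is passed as both tokenizer and preprocessor so that CountVectorizer and TfidfVectorizer consume the token lists unchanged, and binary=True turns raw counts into 0/1 occurrence features. A short usage sketch, not part of the commit, using the new implementations above on an invented pair of documents:

# Hypothetical flattened documents (one token list per review),
# i.e. the output of corpus_documents().
docs = [["good", "good", "actor"], ["bad", "plot", "bad"]]

counts = value_count(docs)          # e.g. row 0 -> good:2, actor:1
occurrence = value_occurence(docs)  # same vocabulary, values clipped to 0/1
tfidf = value_tfidf(docs)           # TF-IDF weighted variant

print(counts.toarray())             # all three are scipy sparse matrices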
# Other attributes

def attribute_polarity_count(reviews):
    return 0


def attribute_length(reviews):
    return 0

## Training


# Training

def train_naive_model(reviews):
    return 0


def train_regression_model(reviews):
    return 0
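The attribute and training functions are still stubs in this commit. Given that the main script imports MultinomialNB and LogisticRegression, one plausible shape for the training step is sketched below; this is only an assumption, not the author's implementation, and the separate features/labels arguments are hypothetical (the stubs take a single reviews argument):

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

def train_naive_model(features, labels):
    # Hypothetical: fit multinomial Naive Bayes on count/occurrence/TF-IDF features.
    model = MultinomialNB()
    model.fit(features, labels)
    return model

def train_regression_model(features, labels):
    # Hypothetical: logistic regression on the same feature matrices.
    model = LogisticRegression(max_iter=1000)
    model.fit(features, labels)
    return model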