nlp_a2019_tp3/pretraitement.py
2019-12-16 18:25:47 -05:00

54 lines
No EOL
2.3 KiB
Python

from collections import Counter

import emoji
from nltk.tokenize import sent_tokenize
# Preprocessing
def pretraitement(article, tok, ner_tagger, pos_tagger):
    """Preprocess one article into per-sentence token lists.

    The article is split into sentences; each sentence is tokenized,
    emojis are demojized (e.g. ":smile:") and pulled out with their
    position, then the remaining tokens are tagged for named entities
    and part of speech.

    Parameters:
        article: raw article text.
        tok: tokenizer object exposing ``tokenize(sentence) -> list[str]``.
        ner_tagger: tagger exposing ``tag(tokens) -> [(token, ner_tag), ...]``.
        pos_tagger: tagger exposing ``tag(tokens) -> [(token, pos_tag), ...]``.

    Returns:
        A triple ``(ner_tokens, pos_tokens, emoji_tokens)`` of parallel
        per-sentence lists:
        - ner_tokens: [(token, ner_tag), ...] with the 'O' (outside) tag dropped;
        - pos_tokens: [(token, pos_tag), ...] restricted to open-class tags;
        - emoji_tokens: [(demojized_name, 1-based_position), ...].
        Sentences that fail tokenization/tagging are skipped (best-effort).
    """
    # Split into sentences first.
    article_sentences = sent_tokenize(article)
    article_ner_tokens = []
    article_pos_tokens = []
    article_emoji_tokens = []
    for sentence in article_sentences:
        if not sentence:
            continue
        # Best-effort: skip sentences the tokenizer rejects instead of
        # aborting the whole article.  The original used a bare
        # ``except: pass`` around the entire loop body, which also
        # swallowed KeyboardInterrupt/SystemExit and hid real bugs;
        # the try blocks are now narrowed to the calls that can raise.
        try:
            sentence_tokens = tok.tokenize(sentence)
        except Exception:
            continue
        # Replace emoji characters by their ":name:" aliases.
        sentence_tokens = [emoji.demojize(token) for token in sentence_tokens if len(token) > 0]
        if not sentence_tokens:
            continue
        # Demojized emojis start with ":"; record them with their 1-based
        # position, then drop them from the token stream.
        emoji_tokens = [(token, i) for i, token in enumerate(sentence_tokens, 1) if token[0] == ":"]
        sentence_tokens = [token for token in sentence_tokens if token[0] != ":"]
        if not sentence_tokens:
            continue
        try:
            # Named entities and collocations.
            sentence_ner = ner_tagger.tag(sentence_tokens)
            # POS tags, used below to drop closed-class words.
            sentence_pos = pos_tagger.tag(sentence_tokens)
        except Exception:
            continue
        ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']
        # Keep only open-class (content) words.
        pos_tokens = [pos_token for pos_token in sentence_pos
                      if pos_token[1] in ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']]
        # Append the three views of this sentence in lockstep.
        article_ner_tokens.append(ner_tokens)
        article_pos_tokens.append(pos_tokens)
        article_emoji_tokens.append(emoji_tokens)
    return article_ner_tokens, article_pos_tokens, article_emoji_tokens
def aggreger_ner_tags(article):
    """Aggregate named-entity counts over all sentences of a preprocessed article.

    Parameters:
        article: the ``(ner_tokens, pos_tokens, emoji_tokens)`` triple
            returned by ``pretraitement``; only ``article[0]`` is read.

    Returns:
        dict mapping each ``(token, ner_tag)`` pair to its occurrence count.
    """
    # Counter replaces the hand-rolled ``get(entity, 0) + 1`` loop;
    # converted back to a plain dict to keep the original return type.
    return dict(Counter(entity for sentence in article[0] for entity in sentence))
def aggreger_pos_tags(article):
    """Aggregate POS-tagged token counts over all sentences of a preprocessed article.

    Parameters:
        article: the ``(ner_tokens, pos_tokens, emoji_tokens)`` triple
            returned by ``pretraitement``; only ``article[1]`` is read.

    Returns:
        dict mapping each ``(token, pos_tag)`` pair to its occurrence count.
    """
    # Counter replaces the hand-rolled ``get(pos, 0) + 1`` loop;
    # converted back to a plain dict to keep the original return type.
    return dict(Counter(pos for sentence in article[1] for pos in sentence))
def aggreger_emoji(article):
    """Aggregate emoji positions over all sentences of a preprocessed article.

    Parameters:
        article: the ``(ner_tokens, pos_tokens, emoji_tokens)`` triple
            returned by ``pretraitement``; only ``article[2]`` is read,
            a list of per-sentence ``[(emoji_name, position), ...]`` lists.

    Returns:
        dict mapping each demojized emoji name (e.g. ":smile:") to the
        list of its 1-based token positions, in encounter order.
    """
    dict_emojis = {}
    for sentence in article[2]:
        # Loop variable renamed from ``emoji``, which shadowed the
        # module-level ``emoji`` import within this function.
        for emoji_name, position in sentence:
            dict_emojis.setdefault(emoji_name, []).append(position)
    return dict_emojis