# (removed pasted viewer metadata residue: "54 lines / 2.3 KiB / Python")
|
import emoji
|
||
|
from nltk.tokenize import sent_tokenize
|
||
|
|
||
|
# Preprocessing
|
||
|
def pretraitement(article,tok,ner_tagger,pos_tagger):
    """Preprocess an article: split into sentences, tokenize, and extract
    NER, POS, and emoji tokens per sentence.

    Parameters
    ----------
    article : str
        Raw article text.
    tok : object
        Tokenizer exposing ``tokenize(str) -> list[str]``.
    ner_tagger : object
        Tagger exposing ``tag(tokens) -> list[(token, tag)]``; tag 'O'
        marks "not an entity".
    pos_tagger : object
        Tagger exposing ``tag(tokens) -> list[(token, tag)]``; presumably
        Universal POS tags ('NOUN', 'VERB', ...) — confirm with the caller.

    Returns
    -------
    tuple[list, list, list]
        Three parallel lists, one entry per successfully processed sentence:
        - NER (token, tag) pairs whose tag is not 'O';
        - POS (token, tag) pairs restricted to open-class tags;
        - emoji (demojized ":name:", 1-based token position) pairs.
    """
    article_ner_tokens = []
    article_pos_tokens = []
    article_emoji_tokens = []
    # Open-class (content-word) POS tags to keep; closed classes are dropped.
    open_classes = {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}

    for sentence in sent_tokenize(article):
        if not sentence:
            continue
        try:
            # Tokenize, drop empty tokens, and turn emoji characters into
            # their ":name:" text form so they can be separated out below.
            sentence_tokens = [emoji.demojize(token)
                               for token in tok.tokenize(sentence) if token]
            if not sentence_tokens:
                continue

            # Demojized emoji start with ':'; record each with its 1-based
            # position in the token stream, then strip them from the tokens.
            emoji_tokens = [(token, i)
                            for i, token in enumerate(sentence_tokens, 1)
                            if token[0] == ":"]
            sentence_tokens = [token for token in sentence_tokens
                               if token[0] != ":"]
            if not sentence_tokens:
                continue

            # Keep only tokens tagged as real named entities.
            ner_tokens = [ner_token for ner_token in ner_tagger.tag(sentence_tokens)
                          if ner_token[1] != 'O']
            # Keep only open-class (content) words; closed classes removed.
            pos_tokens = [pos_token for pos_token in pos_tagger.tag(sentence_tokens)
                          if pos_token[1] in open_classes]

            article_ner_tokens.append(ner_tokens)
            article_pos_tokens.append(pos_tokens)
            article_emoji_tokens.append(emoji_tokens)
        except Exception:
            # Best-effort: a tokenizer/tagger failure skips this sentence only.
            # (Was a bare ``except: pass``, which also swallowed
            # KeyboardInterrupt and SystemExit — narrowed to Exception.)
            continue

    return article_ner_tokens, article_pos_tokens, article_emoji_tokens
|
||
|
|
||
|
def aggreger_ner_tags(article):
    """Count occurrences of each (token, NER tag) pair across all sentences.

    *article* is the triple returned by ``pretraitement``; index 0 holds the
    per-sentence NER token lists.

    Returns a dict mapping each (token, tag) pair to its total frequency.
    """
    counts = {}
    tagged_sentences = article[0]
    for tagged_sentence in tagged_sentences:
        for named_entity in tagged_sentence:
            if named_entity in counts:
                counts[named_entity] += 1
            else:
                counts[named_entity] = 1
    return counts
|
||
|
|
||
|
def aggreger_pos_tags(article):
    """Count occurrences of each (token, POS tag) pair across all sentences.

    *article* is the triple returned by ``pretraitement``; index 1 holds the
    per-sentence POS token lists.

    Returns a dict mapping each (token, tag) pair to its total frequency.
    """
    frequencies = {}
    # Flatten all sentences into one stream of (token, tag) pairs, then tally.
    all_pairs = [pair for tagged_sentence in article[1] for pair in tagged_sentence]
    for pair in all_pairs:
        frequencies[pair] = frequencies.get(pair, 0) + 1
    return frequencies
|
||
|
|
||
|
def aggreger_emoji(article):
    """Collect, for each emoji name, the list of its token positions.

    *article* is the triple returned by ``pretraitement``; index 2 holds the
    per-sentence (emoji_name, position) lists.

    Returns a dict mapping each demojized emoji name (":name:") to the list
    of 1-based token positions where it occurred, in encounter order.
    """
    dict_emojis = {}
    for sentence in article[2]:
        # Loop variable renamed from ``emoji`` — the original shadowed the
        # imported ``emoji`` module within this function.
        for emoji_name, loc in sentence:
            dict_emojis.setdefault(emoji_name, []).append(loc)
    return dict_emojis
|