import emoji
from collections import Counter
from itertools import chain

from nltk.tokenize import sent_tokenize


def pretraitement(article, tok, ner_tagger, pos_tagger):
    """Pre-process an article into per-sentence NER, POS and emoji tokens.

    The article is split into sentences, each sentence is tokenized with
    ``tok``, emojis are demojized (rendered as ``:name:``) and separated
    from the word tokens, then the remaining tokens are tagged.

    Args:
        article: Raw article text.
        tok: Word tokenizer exposing ``tokenize(sentence) -> list[str]``.
        ner_tagger: Tagger exposing ``tag(tokens) -> list[(token, tag)]``;
            tokens tagged ``'O'`` (outside any entity) are discarded.
        pos_tagger: Tagger exposing ``tag(tokens) -> list[(token, tag)]``;
            only open-class tags (ADJ/ADV/INTJ/NOUN/PROPN/VERB) are kept.

    Returns:
        A 3-tuple ``(ner, pos, emojis)`` of parallel per-sentence lists:
        ``ner`` and ``pos`` hold ``(token, tag)`` pairs, ``emojis`` holds
        ``(demojized_token, 1-based_position)`` pairs.
    """
    article_sentences = sent_tokenize(article)
    article_ner_tokens = []
    article_pos_tokens = []
    article_emoji_tokens = []
    # Only open-class (content-word) POS tags are retained.
    open_class_tags = frozenset({'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'})
    for sentence in article_sentences:
        try:
            if not sentence:
                continue
            sentence_tokens = [
                emoji.demojize(token)
                for token in tok.tokenize(sentence)
                if token
            ]
            if not sentence_tokens:
                continue
            # demojize renders emojis as ":name:", so a leading colon marks
            # an emoji token; record its 1-based position in the sentence.
            # NOTE(review): this also strips ordinary tokens that start with
            # ":" (e.g. ":)", a lone colon) — presumably acceptable here.
            emoji_tokens = [
                (token, i)
                for i, token in enumerate(sentence_tokens, 1)
                if token[0] == ":"
            ]
            sentence_tokens = [t for t in sentence_tokens if t[0] != ":"]
            if not sentence_tokens:
                # Emoji-only sentences are dropped entirely (original
                # behavior) so the three output lists stay aligned.
                continue
            ner_tokens = [
                pair for pair in ner_tagger.tag(sentence_tokens)
                if pair[1] != 'O'
            ]
            pos_tokens = [
                pair for pair in pos_tagger.tag(sentence_tokens)
                if pair[1] in open_class_tags
            ]
            article_ner_tokens.append(ner_tokens)
            article_pos_tokens.append(pos_tokens)
            article_emoji_tokens.append(emoji_tokens)
        except Exception:
            # Best-effort per sentence: skip anything the tokenizer/taggers
            # choke on. (Was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue
    return article_ner_tokens, article_pos_tokens, article_emoji_tokens


def aggreger_ner_tags(article):
    """Count occurrences of each (token, tag) named-entity pair.

    Args:
        article: The 3-tuple returned by :func:`pretraitement`; element 0
            holds the per-sentence NER pairs.

    Returns:
        A dict (``Counter``) mapping each ``(token, tag)`` pair to its count.
    """
    return Counter(chain.from_iterable(article[0]))


def aggreger_pos_tags(article):
    """Count occurrences of each (token, tag) POS pair.

    Args:
        article: The 3-tuple returned by :func:`pretraitement`; element 1
            holds the per-sentence POS pairs.

    Returns:
        A dict (``Counter``) mapping each ``(token, tag)`` pair to its count.
    """
    return Counter(chain.from_iterable(article[1]))


def aggreger_emoji(article):
    """Collect every position at which each emoji occurs.

    Args:
        article: The 3-tuple returned by :func:`pretraitement`; element 2
            holds the per-sentence ``(emoji, position)`` pairs.

    Returns:
        A dict mapping each demojized emoji string to the list of 1-based
        positions where it appeared (in encounter order).
    """
    dict_emojis = {}
    for sentence in article[2]:
        # Loop variable renamed from `emoji` — it shadowed the imported module.
        for emo, loc in sentence:
            dict_emojis.setdefault(emo, []).append(loc)
    return dict_emojis