préparation données terminée
This commit is contained in:
parent
a95f89a69d
commit
c60ee17705
7 changed files with 197 additions and 2167 deletions
61
Analyse_Articles.ipynb
Normal file
61
Analyse_Articles.ipynb
Normal file
|
@ -0,0 +1,61 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"f = open(\"pickle/textes_articles_df.pickle\",\"rb\")\n",
|
||||
"textes_articles_df = pickle.load(f)\n",
|
||||
"f.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"textes_articles_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -1,261 +0,0 @@
|
|||
<map version="freeplane 1.7.0">
|
||||
<!--To view this file, download free mind mapping software Freeplane from http://freeplane.sourceforge.net -->
|
||||
<node TEXT="Commentaires sur les réseaux sociaux" FOLDED="false" ID="ID_1075161201" CREATED="1573954855044" MODIFIED="1573954867944" STYLE="oval">
|
||||
<font SIZE="18"/>
|
||||
<hook NAME="MapStyle">
|
||||
<properties edgeColorConfiguration="#808080ff,#ff0000ff,#0000ffff,#00ff00ff,#ff00ffff,#00ffffff,#7c0000ff,#00007cff,#007c00ff,#7c007cff,#007c7cff,#7c7c00ff" fit_to_viewport="false"/>
|
||||
|
||||
<map_styles>
|
||||
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval" UNIFORM_SHAPE="true" VGAP_QUANTITY="24.0 pt">
|
||||
<font SIZE="24"/>
|
||||
<stylenode LOCALIZED_TEXT="styles.predefined" POSITION="right" STYLE="bubble">
|
||||
<stylenode LOCALIZED_TEXT="default" ICON_SIZE="12.0 pt" COLOR="#000000" STYLE="fork">
|
||||
<font NAME="SansSerif" SIZE="10" BOLD="false" ITALIC="false"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="defaultstyle.details"/>
|
||||
<stylenode LOCALIZED_TEXT="defaultstyle.attributes">
|
||||
<font SIZE="9"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="defaultstyle.note" COLOR="#000000" BACKGROUND_COLOR="#ffffff" TEXT_ALIGN="LEFT"/>
|
||||
<stylenode LOCALIZED_TEXT="defaultstyle.floating">
|
||||
<edge STYLE="hide_edge"/>
|
||||
<cloud COLOR="#f0f0f0" SHAPE="ROUND_RECT"/>
|
||||
</stylenode>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="styles.user-defined" POSITION="right" STYLE="bubble">
|
||||
<stylenode LOCALIZED_TEXT="styles.topic" COLOR="#18898b" STYLE="fork">
|
||||
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="styles.subtopic" COLOR="#cc3300" STYLE="fork">
|
||||
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="styles.subsubtopic" COLOR="#669900">
|
||||
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="styles.important">
|
||||
<icon BUILTIN="yes"/>
|
||||
</stylenode>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="styles.AutomaticLayout" POSITION="right" STYLE="bubble">
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level.root" COLOR="#000000" STYLE="oval" SHAPE_HORIZONTAL_MARGIN="10.0 pt" SHAPE_VERTICAL_MARGIN="10.0 pt">
|
||||
<font SIZE="18"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,1" COLOR="#0033ff">
|
||||
<font SIZE="16"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,2" COLOR="#00b439">
|
||||
<font SIZE="14"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,3" COLOR="#990000">
|
||||
<font SIZE="12"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,4" COLOR="#111111">
|
||||
<font SIZE="10"/>
|
||||
</stylenode>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,5"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,6"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,7"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,8"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,9"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,10"/>
|
||||
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,11"/>
|
||||
</stylenode>
|
||||
</stylenode>
|
||||
</map_styles>
|
||||
</hook>
|
||||
<hook NAME="AutomaticEdgeColor" COUNTER="6" RULE="ON_BRANCH_CREATION"/>
|
||||
<node TEXT="Schultes" POSITION="left" ID="ID_1889975585" CREATED="1573955304008" MODIFIED="1573955306512">
|
||||
<edge COLOR="#00ff00"/>
|
||||
<node TEXT="Classification Youtube" ID="ID_960774191" CREATED="1573954869370" MODIFIED="1573955309655">
|
||||
<node TEXT="Étude" ID="ID_1621340516" CREATED="1573955028436" MODIFIED="1573955040248">
|
||||
<node TEXT="Aggressif: 42%" ID="ID_221633146" CREATED="1573954944708" MODIFIED="1573954950067"/>
|
||||
<node TEXT="Essentiels: 6%" ID="ID_1440746951" CREATED="1573954967031" MODIFIED="1573954981297"/>
|
||||
<node TEXT="Stupide: 51%" ID="ID_921312086" CREATED="1573954957415" MODIFIED="1573954966495"/>
|
||||
<node TEXT="Non pertinents: 64%" ID="ID_1705312749" CREATED="1573954934920" MODIFIED="1573954944172"/>
|
||||
</node>
|
||||
<node TEXT="The Guardian 2009" ID="ID_352535949" CREATED="1573955053842" MODIFIED="1573955061455">
|
||||
<node TEXT="Juvenile" ID="ID_1646713003" CREATED="1573955061461" MODIFIED="1573955078889"/>
|
||||
<node TEXT="Aggressive" ID="ID_862048526" CREATED="1573955079074" MODIFIED="1573955081407"/>
|
||||
<node TEXT="Misspelled" ID="ID_286709909" CREATED="1573955081602" MODIFIED="1573955087293"/>
|
||||
<node TEXT="Sexist" ID="ID_153876225" CREATED="1573955087779" MODIFIED="1573955092833"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Pourquoi commenter?" ID="ID_866509994" CREATED="1573955165314" MODIFIED="1573955312129">
|
||||
<node TEXT="12% commentent" ID="ID_1041837989" CREATED="1573955193866" MODIFIED="1573955202502"/>
|
||||
<node TEXT="34% lisent les commentaires" ID="ID_1372530703" CREATED="1573955232869" MODIFIED="1573955240608"/>
|
||||
<node TEXT="53% regardent les 2-3 premiers commentaires" ID="ID_1662261189" CREATED="1573955241001" MODIFIED="1573955255585"/>
|
||||
</node>
|
||||
<node TEXT="Classification en deux étapes" ID="ID_1588097956" CREATED="1573955358766" MODIFIED="1573955365555">
|
||||
<node TEXT="Type et qualité" ID="ID_1894181819" CREATED="1573955376045" MODIFIED="1573955381723">
|
||||
<node TEXT="La distribution est pertinente pour décrire un vidéo" ID="ID_993533082" CREATED="1573955400296" MODIFIED="1573955411700"/>
|
||||
<node TEXT="Lien direct avec les "Like", donc la rentabilité du vidéo pour Youtube" ID="ID_195371144" CREATED="1573955596788" MODIFIED="1573955616313">
|
||||
<node TEXT="Modélisation Régression Neg. Binomiale" ID="ID_218766081" CREATED="1573956262222" MODIFIED="1573956280088"/>
|
||||
</node>
|
||||
<node TEXT="Types" ID="ID_868118585" CREATED="1573955883446" MODIFIED="1573955884542">
|
||||
<node TEXT="t1: Discussion" ID="ID_1000797050" CREATED="1573955777851" MODIFIED="1573955783201"/>
|
||||
<node TEXT="t3: substantiels" ID="ID_1205292896" CREATED="1573955790350" MODIFIED="1573955794297"/>
|
||||
<node TEXT="t2: inférieurs" ID="ID_392602295" CREATED="1573955783999" MODIFIED="1573955789685"/>
|
||||
</node>
|
||||
<node TEXT="Outils" ID="ID_144483136" CREATED="1573955937321" MODIFIED="1573955941603">
|
||||
<node TEXT="Offensive: SentiStrength" ID="ID_1491683881" CREATED="1573955828111" MODIFIED="1573955833421"/>
|
||||
<node TEXT="Liste manuelle de marqueurs émotionnels" ID="ID_217513801" CREATED="1573955864375" MODIFIED="1573955877200"/>
|
||||
<node TEXT="Appariment de mots-clés et du titre" ID="ID_1268991488" CREATED="1573955952300" MODIFIED="1573955975377"/>
|
||||
</node>
|
||||
<node TEXT="" ID="ID_667449169" CREATED="1573955947623" MODIFIED="1573955947623"/>
|
||||
</node>
|
||||
<node TEXT="Permettent l'analyse sémantique du vidéo" ID="ID_1291677418" CREATED="1573955433090" MODIFIED="1573955442246">
|
||||
<node TEXT="Modélisation avec un SVM" ID="ID_1415408576" CREATED="1573956080194" MODIFIED="1573956086398">
|
||||
<node TEXT="Variable réponse: catégorie du vidéo (News, Sports, Music, ...)" ID="ID_92620342" CREATED="1573956116685" MODIFIED="1573956166425"/>
|
||||
<node TEXT="Caractéristiques: Type et qualité des commentaires" ID="ID_575459159" CREATED="1573956125581" MODIFIED="1573956141215"/>
|
||||
</node>
|
||||
<node TEXT="" ID="ID_1244708423" CREATED="1573956216954" MODIFIED="1573956259964"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Inspiration de Ammari et al." ID="ID_473432831" CREATED="1573955655134" MODIFIED="1573955845106">
|
||||
<node TEXT="Identifier les commentaires "noisy"" ID="ID_156080744" CREATED="1573955669957" MODIFIED="1573955686377"/>
|
||||
</node>
|
||||
<node TEXT="https://www.semanticscholar.org/paper/Leave-a-Comment!-An-In-Depth-Analysis-of-User-on-Schultes-Dorner/d84ec961f13ebc56bd45f63ac78a6e07bbba2a63" ID="ID_779661135" CREATED="1573956897563" MODIFIED="1573956898796"/>
|
||||
</node>
|
||||
<node TEXT="Implémentation" POSITION="right" ID="ID_603062380" CREATED="1573956867473" MODIFIED="1573956871207">
|
||||
<edge COLOR="#ff00ff"/>
|
||||
<node TEXT="https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost/index.html" ID="ID_646162134" CREATED="1573956872038" MODIFIED="1573956872849"/>
|
||||
</node>
|
||||
<node TEXT="Halté - Emoticones" POSITION="right" ID="ID_1763549449" CREATED="1575238775212" MODIFIED="1575238785034">
|
||||
<edge COLOR="#00ffff"/>
|
||||
<node TEXT="Emoticones" ID="ID_386506934" CREATED="1575238883772" MODIFIED="1575238887445">
|
||||
<node TEXT="Mimétique et gestuelle" ID="ID_1631325076" CREATED="1575239148209" MODIFIED="1575239154376"/>
|
||||
<node TEXT="Plus fort que les interjections" ID="ID_886136844" CREATED="1575240727401" MODIFIED="1575240735333"/>
|
||||
<node TEXT="Système d'écriture" ID="ID_525765510" CREATED="1575239409720" MODIFIED="1575239420069">
|
||||
<node TEXT="Règles" ID="ID_1483343948" CREATED="1575239421113" MODIFIED="1575239424034"/>
|
||||
<node TEXT="Oppositions" ID="ID_1849414542" CREATED="1575239424924" MODIFIED="1575239427759"/>
|
||||
<node TEXT="Usages" ID="ID_1764780335" CREATED="1575239428189" MODIFIED="1575239429883"/>
|
||||
</node>
|
||||
<node TEXT="Emojis" FOLDED="true" ID="ID_838340950" CREATED="1575242128366" MODIFIED="1575242133912">
|
||||
<node TEXT="Banque normalisée" ID="ID_1322377772" CREATED="1575242133919" MODIFIED="1575242238282">
|
||||
<node TEXT="Verbal" ID="ID_1610044649" CREATED="1575242238977" MODIFIED="1575242243717">
|
||||
<node TEXT="Modalité" ID="ID_1389558503" CREATED="1575242266230" MODIFIED="1575242268798"/>
|
||||
</node>
|
||||
<node TEXT="Non-verbal" FOLDED="true" ID="ID_430063472" CREATED="1575242244105" MODIFIED="1575242250451">
|
||||
<node TEXT="Objets" ID="ID_900313374" CREATED="1575242250461" MODIFIED="1575242253644"/>
|
||||
<node TEXT="Actions" ID="ID_1522816072" CREATED="1575242254098" MODIFIED="1575242264462"/>
|
||||
</node>
|
||||
<node TEXT="Différentes parties du langage (Pierce)" ID="ID_1811986887" CREATED="1575242392026" MODIFIED="1575242450835">
|
||||
<node TEXT="Iconique" ID="ID_1371581343" CREATED="1575242421848" MODIFIED="1575242424675"/>
|
||||
<node TEXT="Indiciel" ID="ID_213205468" CREATED="1575242425224" MODIFIED="1575242427614"/>
|
||||
<node TEXT="Symbolique" ID="ID_349389975" CREATED="1575242428002" MODIFIED="1575242430884"/>
|
||||
</node>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Provine et al. Le sens de l'émoticone est une information additionnelle ou complémentaire au message" ID="ID_790078370" CREATED="1575245786150" MODIFIED="1575245837675">
|
||||
<node TEXT="Nues" ID="ID_1650267953" CREATED="1575245860851" MODIFIED="1575245863060"/>
|
||||
<node TEXT="Début ou fin (+ fréquent)" ID="ID_870280863" CREATED="1575245863647" MODIFIED="1575245884155"/>
|
||||
<node TEXT="Intérieures" ID="ID_37382522" CREATED="1575245868733" MODIFIED="1575245871544"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Interjections et sigles" FOLDED="true" ID="ID_818337825" CREATED="1575238887904" MODIFIED="1575238893112">
|
||||
<node TEXT="Attitude subjective" ID="ID_1458901294" CREATED="1575238961067" MODIFIED="1575238969600"/>
|
||||
<node TEXT="Remplace les gestes, mimiques, intonations" ID="ID_1708771216" CREATED="1575238786167" MODIFIED="1575238882360"/>
|
||||
<node TEXT="montrer plutôt que dire" ID="ID_1458650089" CREATED="1575239025028" MODIFIED="1575239031427"/>
|
||||
</node>
|
||||
<node TEXT="Le tchat" ID="ID_972070502" CREATED="1575240740283" MODIFIED="1575240747317">
|
||||
<node TEXT="Parfois synchrone, parfois non." ID="ID_1205295086" CREATED="1575240748630" MODIFIED="1575240779677"/>
|
||||
<node TEXT="Indices contextuels forts" ID="ID_77869570" CREATED="1575240780110" MODIFIED="1575240786499"/>
|
||||
<node TEXT="Tours de paroles segmentés" ID="ID_971520480" CREATED="1575240805520" MODIFIED="1575240878843"/>
|
||||
<node TEXT="Conversations entrelacées" ID="ID_464582331" CREATED="1575240929788" MODIFIED="1575240941056">
|
||||
<node TEXT="Quidproquo" ID="ID_1623621173" CREATED="1575240943203" MODIFIED="1575240948763"/>
|
||||
<node TEXT="Situations humoristiques" ID="ID_816264224" CREATED="1575240949163" MODIFIED="1575240953649"/>
|
||||
</node>
|
||||
<node TEXT="Émoticone: Portée variable" ID="ID_94048404" CREATED="1575260093511" MODIFIED="1575260103300"/>
|
||||
<node TEXT="Énoncés sur plusieurs lignes, séparations syntaxiques ou non" ID="ID_1730871314" CREATED="1575260541509" MODIFIED="1575260562710"/>
|
||||
<node TEXT="Prise en compte/prise en charge" ID="ID_266160815" CREATED="1575260407487" MODIFIED="1575260417401">
|
||||
<node TEXT="L'émoticone permet de ne pas prendre position, mais de montrer qu'on a bien reçu ce qui a été dit" ID="ID_362215456" CREATED="1575260417403" MODIFIED="1575260451519"/>
|
||||
</node>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Georgalou - Discourse and identity on Facebook" POSITION="left" ID="ID_573961500" CREATED="1576432501896" MODIFIED="1576439987458">
|
||||
<edge COLOR="#7c0000"/>
|
||||
<node TEXT="Nouveaux éléments du langage" ID="ID_884645106" CREATED="1576432511262" MODIFIED="1576432547779">
|
||||
<node TEXT="Ponctuations multiples ?!" ID="ID_1995619041" CREATED="1576432549422" MODIFIED="1576437103533"/>
|
||||
<node TEXT="Interjections" ID="ID_607633784" CREATED="1576437113783" MODIFIED="1576437117640"/>
|
||||
<node TEXT="Majuscules" ID="ID_1383386533" CREATED="1576437118071" MODIFIED="1576437122120"/>
|
||||
</node>
|
||||
<node TEXT="Analyse du discours (Baxter 2010)" ID="ID_1424801803" CREATED="1576432904446" MODIFIED="1576433023929">
|
||||
<node TEXT="Variabilité" ID="ID_1655319920" CREATED="1576432921042" MODIFIED="1576432935538">
|
||||
<node TEXT="Audience" ID="ID_1287238103" CREATED="1576432935545" MODIFIED="1576432942327"/>
|
||||
<node TEXT="Contexte" ID="ID_1017431291" CREATED="1576432943825" MODIFIED="1576432954803"/>
|
||||
</node>
|
||||
<node TEXT="Nature du langage" ID="ID_397305792" CREATED="1576432966755" MODIFIED="1576432977141">
|
||||
<node TEXT="Description" ID="ID_950767559" CREATED="1576432978593" MODIFIED="1576432983163"/>
|
||||
<node TEXT="Narration" ID="ID_868814925" CREATED="1576432983797" MODIFIED="1576432987481"/>
|
||||
<node TEXT="Remarques" ID="ID_877450704" CREATED="1576432988675" MODIFIED="1576432996039"/>
|
||||
<node TEXT="Commentaires" ID="ID_1281731741" CREATED="1576432996663" MODIFIED="1576433001128"/>
|
||||
<node TEXT="Blagues" ID="ID_1309797672" CREATED="1576433001508" MODIFIED="1576433003773"/>
|
||||
</node>
|
||||
<node TEXT="Répertoire" ID="ID_1024196052" CREATED="1576433048058" MODIFIED="1576433050691">
|
||||
<node TEXT="Vocabulaire" ID="ID_709732839" CREATED="1576433053478" MODIFIED="1576433057503"/>
|
||||
<node TEXT="Grammaire" ID="ID_736734389" CREATED="1576433058284" MODIFIED="1576433066147"/>
|
||||
<node TEXT="Figures de style" ID="ID_137013127" CREATED="1576433075268" MODIFIED="1576433080277"/>
|
||||
</node>
|
||||
<node TEXT="Approche" ID="ID_1561649830" CREATED="1576433106369" MODIFIED="1576433112084">
|
||||
<node TEXT="Contexte psychologique" ID="ID_1939058865" CREATED="1576433112090" MODIFIED="1576433123678"/>
|
||||
<node TEXT="Contexte sociopolitique" ID="ID_791737082" CREATED="1576433124443" MODIFIED="1576433130862"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Éléments d'analyse" ID="ID_994837948" CREATED="1576433185944" MODIFIED="1576433207370">
|
||||
<node TEXT="Intertextualité" ID="ID_1353601895" CREATED="1576433212711" MODIFIED="1576433223175">
|
||||
<node TEXT="Liens avec les textes précédents" ID="ID_261026690" CREATED="1576433278055" MODIFIED="1576433286703"/>
|
||||
</node>
|
||||
<node TEXT="Interdiscursivité" ID="ID_1355846398" CREATED="1576433224084" MODIFIED="1576433232077">
|
||||
<node TEXT="Interaction et superposition des échanges" ID="ID_1296550423" CREATED="1576433265025" MODIFIED="1576433274898"/>
|
||||
</node>
|
||||
<node TEXT="Multimodalité" ID="ID_398609083" CREATED="1576433232685" MODIFIED="1576433253744">
|
||||
<node TEXT="Images" ID="ID_996455266" CREATED="1576433241535" MODIFIED="1576433243920"/>
|
||||
<node TEXT="Textes" ID="ID_1799781603" CREATED="1576433244372" MODIFIED="1576433245825"/>
|
||||
<node TEXT="Vidéos" ID="ID_421888374" CREATED="1576433246259" MODIFIED="1576433247808"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Localisation" FOLDED="true" ID="ID_206247466" CREATED="1576433752181" MODIFIED="1576433841871">
|
||||
<node TEXT="Textualisation" FOLDED="true" ID="ID_1774810454" CREATED="1576433842212" MODIFIED="1576433850744">
|
||||
<node TEXT="Élément culturel" ID="ID_726017284" CREATED="1576433850747" MODIFIED="1576433859642"/>
|
||||
</node>
|
||||
<node TEXT="Métonymie" FOLDED="true" ID="ID_1450191917" CREATED="1576433862048" MODIFIED="1576433910984">
|
||||
<node TEXT="Inclus la localisation, mais dans un autre type lexical" ID="ID_1282192414" CREATED="1576433915495" MODIFIED="1576433947584"/>
|
||||
</node>
|
||||
<node TEXT="Personnification" ID="ID_1888056104" CREATED="1576433968022" MODIFIED="1576433972282"/>
|
||||
<node TEXT="Sémiotique transgressive" ID="ID_508849276" CREATED="1576434327152" MODIFIED="1576434342653">
|
||||
<node TEXT="Signes qui ne vont pas ensemble, dans un même discours. Pour marquer l'opposition implicite" ID="ID_1864195644" CREATED="1576434347915" MODIFIED="1576434379898"/>
|
||||
</node>
|
||||
</node>
|
||||
<node TEXT="Temps" FOLDED="true" ID="ID_1259089697" CREATED="1576436972135" MODIFIED="1576436974049">
|
||||
<node TEXT="Temporalité" ID="ID_935107929" CREATED="1576436975051" MODIFIED="1576436979169"/>
|
||||
<node TEXT="Notion de maintenant" ID="ID_63573093" CREATED="1576436979945" MODIFIED="1576436985093"/>
|
||||
<node TEXT="Cycles" ID="ID_1458339480" CREATED="1576437010439" MODIFIED="1576437014557"/>
|
||||
<node TEXT="Saisons, fêtes" ID="ID_566591651" CREATED="1576437044816" MODIFIED="1576437049959"/>
|
||||
<node TEXT="Passé et futur" ID="ID_1383560062" CREATED="1576437051190" MODIFIED="1576437056318"/>
|
||||
<node TEXT="Âge, anniversaires" ID="ID_1886330627" CREATED="1576437056834" MODIFIED="1576437061864"/>
|
||||
</node>
|
||||
<node TEXT="Éducation et expertise" FOLDED="true" ID="ID_869848700" CREATED="1576437570827" MODIFIED="1576437579348">
|
||||
<node TEXT="Montrer son expertise" ID="ID_1657747338" CREATED="1576437580284" MODIFIED="1576437611763"/>
|
||||
<node TEXT="Se réclamer le droit de mener la discussion (entitlement)" ID="ID_1490384821" CREATED="1576437612334" MODIFIED="1576437625926"/>
|
||||
<node TEXT="Montrer ses réussites académiques" ID="ID_1508946328" CREATED="1576437628508" MODIFIED="1576437637956"/>
|
||||
<node TEXT="Références, impératifs, présupposition, souhaits" ID="ID_718153801" CREATED="1576437668961" MODIFIED="1576437681186"/>
|
||||
</node>
|
||||
<node TEXT="Position" ID="ID_727200733" CREATED="1576438549864" MODIFIED="1576438559654">
|
||||
<node TEXT="Expression" FOLDED="true" ID="ID_986508069" CREATED="1576438633543" MODIFIED="1576438637443">
|
||||
<node TEXT="Attitude" ID="ID_274391283" CREATED="1576438566378" MODIFIED="1576438593868"/>
|
||||
<node TEXT="Émotion" ID="ID_920344361" CREATED="1576438594326" MODIFIED="1576438596613"/>
|
||||
<node TEXT="Croyance" ID="ID_592625157" CREATED="1576438596770" MODIFIED="1576438599820"/>
|
||||
<node TEXT="Évaluation/jugement" ID="ID_717619617" CREATED="1576438600172" MODIFIED="1576438605956"/>
|
||||
<node TEXT="Engagement" ID="ID_1367600286" CREATED="1576438606159" MODIFIED="1576438609780"/>
|
||||
</node>
|
||||
<node TEXT="Attributs linguistiques" ID="ID_825473217" CREATED="1576438657794" MODIFIED="1576438663960">
|
||||
<node TEXT="Modalité" ID="ID_197540513" CREATED="1576438664912" MODIFIED="1576438672368"/>
|
||||
<node TEXT="Évaluation" ID="ID_1335244587" CREATED="1576438672980" MODIFIED="1576438676694"/>
|
||||
<node TEXT="Politesse" ID="ID_247379549" CREATED="1576438698682" MODIFIED="1576438703989"/>
|
||||
<node TEXT="Évidentialité" ID="ID_1818362871" CREATED="1576438704862" MODIFIED="1576438708497"/>
|
||||
<node TEXT="Intensité" ID="ID_517078263" CREATED="1576438742226" MODIFIED="1576438746807"/>
|
||||
</node>
|
||||
</node>
|
||||
</node>
|
||||
</node>
|
||||
</map>
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 153,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -12,7 +12,9 @@
|
|||
"from nltk.tokenize import toktok, sent_tokenize\n",
|
||||
"from nltk.parse import CoreNLPParser\n",
|
||||
"import re\n",
|
||||
"import pickle"
|
||||
"import pickle\n",
|
||||
"import emoji\n",
|
||||
"import pretraitement as pr"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -62,87 +64,23 @@
|
|||
"del textes_articles_df['Unnamed: 0']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prétraitement\n",
|
||||
"def pretraitement(article):\n",
|
||||
" # tokeniser par phrases\n",
|
||||
" article_sentences = sent_tokenize(article)\n",
|
||||
" article_ner_tokens = []\n",
|
||||
" article_pos_tokens = []\n",
|
||||
" compteur_phrase = 0\n",
|
||||
" for sentence in article_sentences:\n",
|
||||
" # Tokeniser\n",
|
||||
" sentence_tokens = tok.tokenize(sentence)\n",
|
||||
" # Assembler les entités nommées et colocations\n",
|
||||
" sentence_ner = ner_tagger.tag(sentence_tokens)\n",
|
||||
" ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']\n",
|
||||
" # Supprimer les classes fermées avec un POS\n",
|
||||
" sentence_pos = pos_tagger.tag(sentence_tokens)\n",
|
||||
" pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in ['ADJ','ADV','INTJ','NOUN','PROPN','VERB']]\n",
|
||||
" # Ajouter à la liste de phrases tokenisées\n",
|
||||
" article_ner_tokens.append(ner_tokens)\n",
|
||||
" article_pos_tokens.append(pos_tokens)\n",
|
||||
" compteur_phrase += 1\n",
|
||||
" return article_ner_tokens, article_pos_tokens"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"article_pretraite = [pretraitement(x) for x in list(textes_articles_df[\"text\"])]"
|
||||
"article_pretraite = [pr.pretraitement(x,tok,ner_tagger,pos_tagger) for x in list(textes_articles_df[\"text\"])]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 131,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def aggreger_ner_tags(article):\n",
|
||||
" dict_named_entity = {}\n",
|
||||
" for sentence in article[0]:\n",
|
||||
" for entity in sentence:\n",
|
||||
" dict_named_entity[entity] = dict_named_entity.get(entity,0) + 1\n",
|
||||
" return dict_named_entity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 132,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def aggreger_pos_tags(article):\n",
|
||||
" dict_pos = {}\n",
|
||||
" for sentence in article[1]:\n",
|
||||
" for pos in sentence:\n",
|
||||
" dict_pos[pos] = dict_pos.get(pos,0) + 1\n",
|
||||
" return dict_pos"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 165,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"textes_articles_df['ner_dict']=[aggreger_ner_tags(article) for article in article_pretraite]\n",
|
||||
"textes_articles_df['pos_dict']=[aggreger_pos_tags(article) for article in article_pretraite]"
|
||||
"textes_articles_df['ner_dict']=[pr.aggreger_ner_tags(article) for article in article_pretraite]\n",
|
||||
"textes_articles_df['pos_dict']=[pr.aggreger_pos_tags(article) for article in article_pretraite]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -14,7 +14,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -23,7 +23,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -40,7 +40,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -49,7 +49,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
54
pretraitement.py
Normal file
54
pretraitement.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
import emoji
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
# Prétraitement
|
||||
def pretraitement(article,tok,ner_tagger,pos_tagger):
    """Extract named-entity, open-class POS and emoji tokens from an article.

    The article is split into sentences with ``sent_tokenize``; each sentence
    is tokenized with ``tok``, emojis are converted to their ``:name:`` text
    form with ``emoji.demojize`` and separated from the other tokens, and the
    remaining tokens are passed to both taggers.

    Parameters
    ----------
    article : str
        Raw text of the article.
    tok : object
        Tokenizer exposing ``tokenize(sentence)`` and returning string tokens.
    ner_tagger : object
        Tagger exposing ``tag(tokens)``; items whose element ``[1]`` is ``'O'``
        are discarded.
    pos_tagger : object
        Tagger exposing ``tag(tokens)``; only items whose element ``[1]`` is
        one of ADJ, ADV, INTJ, NOUN, PROPN, VERB are kept.

    Returns
    -------
    tuple
        ``(article_ner_tokens, article_pos_tokens, article_emoji_tokens)``.
        The three lists are aligned: one entry per processed sentence. Each
        emoji entry is a list of ``(demojized_name, position)`` pairs, where
        ``position`` is the 1-based index of the token in the sentence
        (counted over all non-empty tokens, emojis included).

    Notes
    -----
    Processing is best effort: a sentence whose tokenization or tagging
    raises is logged and skipped instead of aborting the whole article.
    """
    # Function-scope import so the block stays self-contained.
    import logging

    open_classes = ('ADJ','ADV','INTJ','NOUN','PROPN','VERB')

    article_ner_tokens = []
    article_pos_tokens = []
    article_emoji_tokens = []

    # Split into sentences
    for sentence in sent_tokenize(article):
        if not sentence:
            continue
        try:
            # Tokenize, dropping empty tokens
            raw_tokens = [token for token in tok.tokenize(sentence) if token]

            emoji_tokens = []
            word_tokens = []
            for position, token in enumerate(raw_tokens, 1):
                demojized = emoji.demojize(token)
                # A token is an emoji only if demojize actually rewrote it.
                # Testing the leading ":" alone would also capture plain
                # colon punctuation emitted by the tokenizer.
                if demojized != token and demojized.startswith(":"):
                    emoji_tokens.append((demojized, position))
                else:
                    word_tokens.append(demojized)

            if not emoji_tokens and not word_tokens:
                continue

            ner_tokens = []
            pos_tokens = []
            # Taggers are only called when there is something to tag, so a
            # sentence made only of emojis still contributes its emojis.
            if word_tokens:
                # Keep named entities (anything not tagged 'O')
                sentence_ner = ner_tagger.tag(word_tokens)
                ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']
                # Drop closed-class words based on their POS tag
                sentence_pos = pos_tagger.tag(word_tokens)
                pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in open_classes]
        except Exception:
            # Deliberate best effort, but no longer silent; KeyboardInterrupt
            # and SystemExit are not intercepted.
            logging.getLogger(__name__).warning(
                "pretraitement: sentence skipped after an error", exc_info=True)
            continue

        # Append the three results together so the lists stay aligned
        article_ner_tokens.append(ner_tokens)
        article_pos_tokens.append(pos_tokens)
        article_emoji_tokens.append(emoji_tokens)

    return article_ner_tokens, article_pos_tokens, article_emoji_tokens
|
||||
|
||||
def aggreger_ner_tags(article):
    """Count the occurrences of each named-entity token in an article.

    ``article`` is indexed at ``[0]`` to get the per-sentence lists of
    named-entity tokens. Returns a dict mapping each distinct token to the
    number of times it appears across all sentences.
    """
    counts = {}
    for sentence_entities in article[0]:
        for tagged_entity in sentence_entities:
            if tagged_entity in counts:
                counts[tagged_entity] += 1
            else:
                counts[tagged_entity] = 1
    return counts
|
||||
|
||||
def aggreger_pos_tags(article):
    """Count the occurrences of each POS-tagged token in an article.

    ``article`` is indexed at ``[1]`` to get the per-sentence lists of
    POS-tagged tokens. Returns a dict mapping each distinct token to the
    number of times it appears across all sentences.
    """
    tally = {}
    # Flatten the sentences into a single stream of tagged tokens
    tagged_tokens = (pair for sentence in article[1] for pair in sentence)
    for pair in tagged_tokens:
        tally[pair] = tally.get(pair, 0) + 1
    return tally
|
||||
|
||||
def aggreger_emoji(article):
    """Group emoji positions by emoji across all sentences of an article.

    ``article`` is indexed at ``[2]`` to get the per-sentence lists of
    ``(emoji, position)`` pairs. Returns a dict mapping each emoji to the
    list of its positions, in the order they are encountered.
    """
    positions_by_emoji = {}
    for sentence_emojis in article[2]:
        for name, position in sentence_emojis:
            if name not in positions_by_emoji:
                positions_by_emoji[name] = []
            positions_by_emoji[name].append(position)
    return positions_by_emoji
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -27,47 +27,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"jupyter": {
|
||||
"outputs_hidden": true
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'textes_articles' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-4-b88ef33508d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtextes_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'media'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'post_id'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"textes_articles_df = pd.DataFrame(textes_articles, columns=['media','post_id','text'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'textes_articles_df' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-5-cc028516ec1f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"textes_articles_df.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles_df' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"textes_articles_df.to_csv(\"textes_articles_df.csv\")"
|
||||
]
|
||||
|
|
Loading…
Reference in a new issue