adjustments made while writing the article

François Pelletier 2022-02-06 18:26:27 -05:00
parent 76a385e1d0
commit 382ceb5727
11 changed files with 12368 additions and 916 deletions

.gitignore (vendored, new file, +3)

@ -0,0 +1,3 @@
.idea/
.ipynb_checkpoints/

Jupyter notebook (.ipynb)

@ -5,9 +5,9 @@
"id": "423050d6-415a-4d17-b0fc-0446357d6bb7",
"metadata": {},
"source": [
"# Livres à censurer\n",
"Je t'invite à te joindre.# Livres à censurer\n",
" \n",
"Étude de la liste de livres à censurer de Matt Krause, un dinosaure républicain du Texas\n",
"Étude de la liste de livres à censurer de Matt Krause, un élu républicain du Texas\n",
"\n",
"Source des informations: \n",
"\n",
@ -28,11 +28,10 @@
"import spacy\n",
"import re\n",
"from spacy import displacy\n",
"from collections import Counter\n",
"from string import punctuation\n",
"\n",
"# Dataviz stuff\n",
"from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
"from wordcloud import WordCloud, ImageColorGenerator\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from PIL import Image\n",
@ -57,7 +56,22 @@
"metadata": {},
"outputs": [],
"source": [
"krausebooklist = pd.read_csv(\"krausebooklist.csv\", dtype={'Published': int})"
"krausebooklist = pd.read_csv(\"scraped_data/krausebooklist.csv\",\n",
" dtype={'Published': int})"
]
},
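If the Published column ever contains blanks, the hard int dtype above will raise on load; a hedged variant using pandas' nullable integer type (same file, assumed layout):

import pandas as pd

# 'Int64' (capital I) is the nullable integer dtype, so missing years survive
krausebooklist = pd.read_csv("scraped_data/krausebooklist.csv",
                             dtype={"Published": "Int64"})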
{
"cell_type": "code",
"execution_count": null,
"id": "8c32d2af",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"krausebooklist"
]
},
{
@ -75,16 +89,28 @@
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(5,5)) \n",
"sns.histplot(data=krausebooklist, \n",
" x=\"Published\", \n",
"plt.figure(figsize=(5,5))\n",
"plt.tight_layout()\n",
"sns.histplot(data=krausebooklist,\n",
" x=\"Published\",\n",
" discrete=False, \n",
" binrange=[1965,2025],\n",
" binwidth=5)\n",
"plt.tight_layout()\n",
"plt.show()"
"plt.savefig(\"out_img/annees_publication.png\")"
]
},
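The new cell saves the histogram with plt.savefig instead of showing it. When both the file and the inline display are wanted, the save has to come first; a minimal sketch of that ordering:

import matplotlib.pyplot as plt

plt.savefig("out_img/annees_publication.png")  # write the PNG while the figure is live
plt.show()  # then display it; the reverse order can save a blank image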
{
"cell_type": "code",
"execution_count": null,
"id": "ceaa1c99",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "2881f4b2-1569-42a6-8d04-50b2e6ee1c59",
@ -100,18 +126,75 @@
"metadata": {},
"outputs": [],
"source": [
"goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks.json\")"
"goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks_new.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c7c84ff-7a85-4164-bf58-5b8f946eed21",
"metadata": {},
"id": "1b83a00e",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"goodreadsbooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bb0a8f9",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def extraire_debut_descr(str_desc):\n",
" description = \"\".join(str_desc.split(\"\\n\"))\n",
" try:\n",
" desc = description.split(\"## Get A Copy\")[0]\n",
" if desc is None:\n",
" desc_clean=\"None\"\n",
" else:\n",
" desc_clean=desc\n",
" except Exception as N:\n",
" desc_clean = str(N)\n",
" return desc_clean"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46bd7edd",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"extraire_debut_descr(goodreadsbooks['description'][0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "445a9d81",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"goodreadsbooks['description_clean'] = \\\n",
"goodreadsbooks['description'] \\\n",
".apply(lambda x: extraire_debut_descr(x)) \\\n",
".apply(lambda x: x.replace('\\n',' ') \\\n",
" .replace('_',' ') \\\n",
" .replace('*',' ') \\\n",
@ -157,6 +240,20 @@
"### Chargement des données avec Spacy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b452f5f",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"spacy.prefer_gpu()"
]
},
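spacy.prefer_gpu() quietly returns False when no usable GPU is present, so the pipeline silently falls back to CPU; a small check makes that visible:

import spacy

# True only if thinc could claim a GPU; False means everything runs on CPU
if not spacy.prefer_gpu():
    print("No GPU found; the transformer pipeline will run on CPU")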
{
"cell_type": "code",
"execution_count": null,
@ -164,7 +261,7 @@
"metadata": {},
"outputs": [],
"source": [
"nlp = spacy.load(\"en_core_web_md\")"
"nlp = spacy.load(\"en_core_web_trf\")"
]
},
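The switch from en_core_web_md to en_core_web_trf assumes the transformer model is already installed; note also that trf models ship without the static word vectors en_core_web_md provides, which matters for the most_similar lookup near the end of the notebook. A sketch that installs the model on demand, assuming network access:

import spacy

try:
    nlp = spacy.load("en_core_web_trf")
except OSError:
    # Model not installed yet: fetch it once, then load it
    spacy.cli.download("en_core_web_trf")
    nlp = spacy.load("en_core_web_trf")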
{
@ -184,7 +281,7 @@
},
"outputs": [],
"source": [
"docs = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
"docs = nlp.pipe(goodreadsbooks['description_clean'])"
]
},
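Dropping n_process=7 is consistent with the transformer pipeline, which does not benefit from fork-based multiprocessing the way the small models do. nlp.pipe is also lazy, returning a generator consumed later; if memory is tight, an explicit batch size can be set (32 is an assumed, tunable value):

docs = nlp.pipe(goodreadsbooks['description_clean'], batch_size=32)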
{
@ -212,9 +309,9 @@
" keyword = []\n",
" docs_list.append(doc)\n",
" for token in doc:\n",
" if(token.is_stop or token.text in punctuation):\n",
" if token.is_stop or token.text in punctuation:\n",
" continue\n",
" if(token.pos_ in pos_tag):\n",
" if token.pos_ in pos_tag:\n",
" keyword.append({\n",
" 'token': j,\n",
" 'text': token.text,\n",
@ -238,6 +335,20 @@
"docs_keyword_exp = pd.DataFrame(docs_keyword).explode('keywords')"
]
},
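The cell above explodes the list-of-dicts keywords column, and a later cell flattens the dicts with json_normalize; a self-contained toy example of the combined mechanics:

import pandas as pd

toy = pd.DataFrame({"document": [0, 1],
                    "keywords": [[{"text": "heart", "pos": "NOUN"}],
                                 [{"text": "ban", "pos": "VERB"},
                                  {"text": "book", "pos": "NOUN"}]]})
exp = toy.explode("keywords")                        # one row per keyword dict
flat = pd.json_normalize(exp["keywords"].to_list())  # dict keys become columns
flat["document"] = exp["document"].values            # realign by position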
{
"cell_type": "code",
"execution_count": null,
"id": "1cead436",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp[\"keywords\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -245,27 +356,91 @@
"metadata": {},
"outputs": [],
"source": [
"docs_keyword_exp2 = pd.DataFrame(docs_keyword_exp['keywords'].values.tolist(), index=docs_keyword_exp.index)"
"docs_keyword_exp2 = pd.json_normalize(docs_keyword_exp['keywords'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d8bf599-af04-410b-9cc7-8113118a9daa",
"metadata": {},
"id": "443c23aa",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp2['document'] = docs_keyword_exp['document']"
"docs_keyword_exp2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c08de94-dc78-4f40-a925-d3f3a81b7db5",
"metadata": {},
"id": "5699ab10",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp2 = docs_keyword_exp2.reset_index()"
"docs_keyword_exp2.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f67259e5",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"del docs_keyword_exp2[\"index\"]"
]
},
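The reset_index call and the del of the leftover index column above can be collapsed into one step:

# drop=True discards the old index instead of keeping it as a column
docs_keyword_exp2.reset_index(drop=True, inplace=True)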
{
"cell_type": "code",
"execution_count": null,
"id": "7a9fbd37",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8591c414",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp2['document'] = docs_keyword_exp['document'].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91c9bd5a",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs_keyword_exp2"
]
},
{
@ -275,18 +450,10 @@
"metadata": {},
"outputs": [],
"source": [
"docs_keyword_exp2['lemma'] = docs_keyword_exp2['lemma'].apply(lambda x: x.lower())\n",
"docs_keyword_exp2['text'] = docs_keyword_exp2['text'].apply(lambda x: x.lower())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47e681aa-645b-49ce-97b0-a264aea863c4",
"metadata": {},
"outputs": [],
"source": [
"del docs_keyword_exp2['index']"
"docs_keyword_exp2['lemma'] = \\\n",
" docs_keyword_exp2['lemma'].apply(lambda x: str(x).lower())\n",
"docs_keyword_exp2['text'] = \\\n",
" docs_keyword_exp2['text'].apply(lambda x: str(x).lower())"
]
},
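Wrapping each value in str() guards against non-string entries such as NaN; pandas' vectorized string methods give an equivalent, slightly faster form (same columns assumed):

for col in ("lemma", "text"):
    docs_keyword_exp2[col] = docs_keyword_exp2[col].astype(str).str.lower()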
{
@ -306,7 +473,60 @@
"metadata": {},
"outputs": [],
"source": [
"word_frequency = docs_keyword_exp2['lemma'].value_counts().to_dict()"
"word_frequency = docs_keyword_exp2['lemma'] \\\n",
" .value_counts() \\\n",
" .to_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15b050fd",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"word_frequency"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eebc4dce",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"wordcloud = WordCloud(width=512,\n",
" height=512,\n",
" max_font_size=72,\n",
" max_words=1000,\n",
" background_color=\"#ece9dc\") \\\n",
".generate_from_frequencies(word_frequency)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "102f4099",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"plt.figure(figsize=(10, 10))\n",
"plt.imshow(wordcloud, interpolation='bilinear')\n",
"plt.axis(\"off\")\n",
"plt.savefig(\"out_img/wordcloud_words_regular.png\", format=\"png\")\n",
"plt.show()"
]
},
{
@ -324,7 +544,7 @@
"metadata": {},
"outputs": [],
"source": [
"docs2 = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
"docs2 = nlp.pipe(goodreadsbooks['description_clean'])"
]
},
{
@ -376,7 +596,7 @@
"metadata": {},
"outputs": [],
"source": [
"docs_chunks_exp2 = pd.DataFrame(docs_chunks_exp['keywords'].values.tolist())"
"docs_chunks_exp2 = pd.json_normalize(docs_chunks_exp['keywords'])"
]
},
{
@ -386,7 +606,7 @@
"metadata": {},
"outputs": [],
"source": [
"docs_chunks_exp2['document'] = docs_chunks_exp['document']"
"docs_chunks_exp2['document'] = docs_chunks_exp['document'].values"
]
},
{
@ -396,7 +616,21 @@
"metadata": {},
"outputs": [],
"source": [
"docs_chunks_exp2 = docs_chunks_exp2.reset_index()"
"docs_chunks_exp2.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1119daae",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"del docs_chunks_exp2[\"index\"]"
]
},
{
@ -406,19 +640,12 @@
"metadata": {},
"outputs": [],
"source": [
"docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'].apply(lambda x: x.lower())\n",
"docs_chunks_exp2['root'] = docs_chunks_exp2['root'].apply(lambda x: x.lower())\n",
"docs_chunks_exp2['text'] = docs_chunks_exp2['text'].apply(lambda x: x.lower())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd59bca3-19b3-4a25-abf3-bdbf00aac5b6",
"metadata": {},
"outputs": [],
"source": [
"del docs_chunks_exp2['index']"
"docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'] \\\n",
" .apply(lambda x: str(x).lower())\n",
"docs_chunks_exp2['root'] = docs_chunks_exp2['root'] \\\n",
" .apply(lambda x: str(x).lower())\n",
"docs_chunks_exp2['text'] = docs_chunks_exp2['text'] \\\n",
" .apply(lambda x: str(x).lower())"
]
},
{
@ -431,18 +658,6 @@
"docs_chunks_exp2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ec40fff-c929-47bb-bd89-24ebd4d82808",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"word_frequency_chunks = docs_chunks_exp2['text'].value_counts().to_dict()"
]
},
{
"cell_type": "markdown",
"id": "eb82dc33-19a3-40a3-bbad-18df4fdc32bf",
@ -453,6 +668,36 @@
"### Word Cloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3da4c819",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"word_frequency_chunks = docs_chunks_exp2['text'] \\\n",
" .value_counts() \\\n",
" .to_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3eb6ff4",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"word_frequency_chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -515,7 +760,7 @@
"image_colors = ImageColorGenerator(my_mask)\n",
"\n",
"plt.figure(figsize=(10, 10))\n",
"plt.imshow(wordcloud_book.recolor(color_func=image_colors), interpolation='bilinear')\n",
"plt.imshow(wordcloud_mask.recolor(color_func=image_colors), interpolation='bilinear')\n",
"plt.axis(\"off\")\n",
"plt.savefig(\"out_img/wordcloud_heart.png\", format=\"png\")\n",
"plt.show()"
@ -592,7 +837,7 @@
"metadata": {},
"outputs": [],
"source": [
"your_word=sentence2_1[2].text"
"your_word=sentence2_1[2].text.lower()"
]
},
{
@ -609,7 +854,11 @@
"cell_type": "code",
"execution_count": null,
"id": "8d1405a6-d25f-4bcc-917e-48b3aad70b01",
"metadata": {},
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"ms = nlp.vocab.vectors.most_similar(\n",
@ -622,8 +871,12 @@
{
"cell_type": "code",
"execution_count": null,
"id": "bae29e2f-b945-45a8-b925-5d35839e4f0b",
"metadata": {},
"id": "6154d722",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
@ -644,9 +897,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

Binary file not shown (new image, 5.8 KiB)

Binary file not shown (image, 228 KiB → 176 KiB)

Binary file not shown (image, 341 KiB → 385 KiB)

Binary file not shown (new image, 401 KiB)

File diff suppressed because one or more lines are too long

Python script (Goodreads book spider)

@ -1,13 +1,15 @@
#!/usr/bin/env python
# coding: utf-8
import datetime
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import JsonLinesItemExporter
from scrapy.item import Item, Field
import html2text
import re
from dateutil.parser import parse
class GoodReadsBookItem(Item):
bookTitle = Field()
@ -20,32 +22,44 @@ class GoodReadsBookItem(Item):
reviewCount = Field()
bookFormat = Field()
numberOfPages = Field()
datePublished = Field()
isbn = Field()
class GoodReadsBookSpider(scrapy.Spider):
name = "goodreadsbookspider"
def start_requests(self):
urls = pd.read_json("scraped_data/goodreadslist.json").href
for url in urls:
yield scrapy.Request(url=url,
callback=self.parse,
dont_filter=True
)
)
def clean_string(self, strExtract):
if strExtract==None:
if strExtract is None:
x = ""
else:
x = strExtract.strip('\n ')
return x
def split_published_date(self, strPublisher):
try:
datepublished_str = re.search(
r'^.*Published\s(\w+\s\d+\w*\s\d+)\sby.*',
strPublisher).group(1)
datepublished = parse(datepublished_str).isoformat()
except Exception:
# Fall back to the scrape time, kept as ISO text for type consistency
datepublished = datetime.datetime.now().isoformat()
return datepublished
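For reference, a standalone check of the regex above against the kind of publisher blurb Goodreads renders; the sample string is hypothetical:

import re
from dateutil.parser import parse

sample = "Published March 3rd 2020 by Penguin Books"  # assumed format
match = re.search(r'^.*Published\s(\w+\s\d+\w*\s\d+)\sby.*', sample)
print(parse(match.group(1)).isoformat())  # -> 2020-03-03T00:00:00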
def parse(self, response):
metacol = response.css('#metacol')
converter = html2text.HTML2Text()
converter.ignore_links = True
data = GoodReadsBookItem(
{
'bookTitle': self.clean_string(metacol.css(
@ -55,29 +69,48 @@ class GoodReadsBookSpider(scrapy.Spider):
'authorURL': self.clean_string(metacol.css(
'.authorName').attrib["href"]),
'authorName': self.clean_string(metacol.css(
'.authorName > span:nth-child(1)::text').extract_first()),
'.authorName > span:nth-child(1)::text')
.extract_first()),
'description': converter.handle(metacol.css(
'#description').extract_first()),
'#descriptionContainer').extract_first()),
'ratingValue': self.clean_string(metacol.css(
'#bookMeta > span:nth-child(2)::text').extract_first()),
'#bookMeta > span:nth-child(2)::text')
.extract_first()),
'ratingCount': self.clean_string(metacol.css(
'a.gr-hyperlink:nth-child(7) > meta:nth-child(1)').attrib['content']),
'a.gr-hyperlink:nth-child(7) > '
'meta:nth-child(1)')
.attrib[
'content']),
'reviewCount': self.clean_string(metacol.css(
'a.gr-hyperlink:nth-child(9) > meta:nth-child(1)').attrib['content']),
'a.gr-hyperlink:nth-child(9) > '
'meta:nth-child(1)')
.attrib[
'content']),
'bookFormat': self.clean_string(metacol.css(
'#details > div:nth-child(1) > span:nth-child(1)::text').extract_first()),
'#details > div:nth-child(1) >'
' span:nth-child(1)::text').extract_first()),
'numberOfPages': self.clean_string(metacol.css(
'#details > div:nth-child(1) > span:nth-child(2)::text').extract_first()),
'#details > div:nth-child(1) >'
' span:nth-child(2)::text').extract_first())
.replace(
r' pages', ''),
'datePublished': self.split_published_date(
self.clean_string(metacol.css(
'#details > div:nth-child(2)')
.extract_first())),
'isbn': self.clean_string(metacol.css(
'div.infoBoxRowItem:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text').extract_first())
'div.infoBoxRowItem:nth-child(2) >'
' span:nth-child(1) > span:nth-child(1)::text')
.extract_first())
}
)
if data['bookTitle'] != "":
yield data
process = CrawlerProcess({
'AUTOTHROTTLE_ENABLED': 'True',
'AUTOTHROTTLE_START_DELAY': 1.0,
@ -85,15 +118,15 @@ process = CrawlerProcess({
'COOKIES_ENABLED': False,
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
'FEEDS': {
'goodreadsbooks.json':{
'goodreadsbooks_new.json': {
'format': 'json',
'encoding': 'utf8',
'store_empty': False,
'fields': None,
'indent': 4,
'item_export_kwargs': {
'export_empty_fields': True,
}
'encoding': 'utf8',
'store_empty': False,
'fields': None,
'indent': 4,
'item_export_kwargs': {
'export_empty_fields': True,
}
}
}
})

Python script (Goodreads list spider)

@ -4,40 +4,56 @@
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import JsonLinesItemExporter
from scrapy.item import Item, Field
class GoodReadsListItem(Item):
title = Field()
href = Field()
class GoodReadsListSpider(scrapy.Spider):
name = "goodreadslistspider"
def start_requests(self):
urls = ["https://www.goodreads.com/list/show/168413.Banned_Books_According_to_Krause_Part_1_",
"https://www.goodreads.com/list/show/168429.Banned_Books_According_to_Krause_Part_7",
"https://www.goodreads.com/list/show/168430.Banned_Books_According_to_Krause_Part_8",
"https://www.goodreads.com/list/show/168425.Banned_Books_According_to_Krause_Part_4",
"https://www.goodreads.com/list/show/168420.Banned_Books_According_to_Krause_Part_2",
"https://www.goodreads.com/list/show/168424.Banned_Books_According_to_Krause_Part_3",
"https://www.goodreads.com/list/show/168428.Banned_Books_According_to_Krause_Part_6",
"https://www.goodreads.com/list/show/168426.Banned_Books_According_to_Krause_Part_5",
"https://www.goodreads.com/list/show/168432.Banned_Books_According_to_Krause_Part_9"
]
urls = [
"https://www.goodreads.com/list/show/"
"168413.Banned_Books_According_to_Krause_Part_1_",
"https://www.goodreads.com/list/show/"
"168429.Banned_Books_According_to_Krause_Part_7",
"https://www.goodreads.com/list/show/"
"168430.Banned_Books_According_to_Krause_Part_8",
"https://www.goodreads.com/list/show/"
"168425.Banned_Books_According_to_Krause_Part_4",
"https://www.goodreads.com/list/show/"
"168420.Banned_Books_According_to_Krause_Part_2",
"https://www.goodreads.com/list/show/"
"168424.Banned_Books_According_to_Krause_Part_3",
"https://www.goodreads.com/list/show/"
"168428.Banned_Books_According_to_Krause_Part_6",
"https://www.goodreads.com/list/show/"
"168426.Banned_Books_According_to_Krause_Part_5",
"https://www.goodreads.com/list/show/"
"168432.Banned_Books_According_to_Krause_Part_9"
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")
anchors = response.css("a.bookTitle")
for anchor in anchors:
data = GoodReadsListItem({
'title': anchor.css('span::text').get(),
'href': "https://goodreads.com"+anchor.attrib['href']
'href': "https://goodreads.com" + anchor.attrib[
'href']
})
yield data
# Cache and cookies must be disabled, otherwise the website behaves unpredictably and does not return the right data
# Cache and cookies must be disabled,
# otherwise the website behaves unpredictably
# and does not return the right data
process = CrawlerProcess({
'AUTOTHROTTLE_ENABLED': 'True',
@ -45,20 +61,19 @@ process = CrawlerProcess({
'HTTPCACHE_ENABLED': False,
'COOKIES_ENABLED': False,
'FEEDS': {
'scraped_data/goodreadslist.json':{
'scraped_data/goodreadslist.json': {
'format': 'json',
'encoding': 'utf8',
'store_empty': False,
'fields': None,
'indent': 4,
'item_export_kwargs': {
'export_empty_fields': True,
}
'encoding': 'utf8',
'store_empty': False,
'fields': None,
'indent': 4,
'item_export_kwargs': {
'export_empty_fields': True,
}
}
}
})
process.crawl(GoodReadsListSpider)
process.start()
process.stop()