ajustements faits en écrivant l'article
This commit is contained in:
parent
76a385e1d0
commit
382ceb5727
11 changed files with 12368 additions and 916 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
.idea/
|
||||
.ipynb_checkpoints/
|
||||
|
|
@ -5,9 +5,9 @@
|
|||
"id": "423050d6-415a-4d17-b0fc-0446357d6bb7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Livres à censurer\n",
|
||||
"# Livres à censurer\n",
|
||||
" \n",
|
||||
"Étude de la liste de livres à censurer de Matt Krause, un dinosaure républicain du Texas\n",
|
||||
"Étude de la liste de livres à censurer de Matt Krause, un élu républicain du Texas\n",
|
||||
"\n",
|
||||
"Source des informations: \n",
|
||||
"\n",
|
||||
|
@ -28,11 +28,10 @@
|
|||
"import spacy\n",
|
||||
"import re\n",
|
||||
"from spacy import displacy\n",
|
||||
"from collections import Counter\n",
|
||||
"from string import punctuation\n",
|
||||
"\n",
|
||||
"# Dataviz stuff\n",
|
||||
"from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
|
||||
"from wordcloud import WordCloud, ImageColorGenerator\n",
|
||||
"import seaborn as sns\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from PIL import Image\n",
|
||||
|
@ -57,7 +56,22 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"krausebooklist = pd.read_csv(\"krausebooklist.csv\", dtype={'Published': int})"
|
||||
"krausebooklist = pd.read_csv(\"scraped_data/krausebooklist.csv\",\n",
|
||||
" dtype={'Published': int})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8c32d2af",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"krausebooklist"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -75,16 +89,28 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.figure(figsize=(5,5)) \n",
|
||||
"sns.histplot(data=krausebooklist, \n",
|
||||
" x=\"Published\", \n",
|
||||
"plt.figure(figsize=(5,5))\n",
|
||||
"plt.tight_layout()\n",
|
||||
"sns.histplot(data=krausebooklist,\n",
|
||||
" x=\"Published\",\n",
|
||||
" discrete=False, \n",
|
||||
" binrange=[1965,2025],\n",
|
||||
" binwidth=5)\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()"
|
||||
"plt.savefig(\"out_img/annees_publication.png\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ceaa1c99",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2881f4b2-1569-42a6-8d04-50b2e6ee1c59",
|
||||
|
@ -100,18 +126,75 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks.json\")"
|
||||
"goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks_new.json\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1c7c84ff-7a85-4164-bf58-5b8f946eed21",
|
||||
"metadata": {},
|
||||
"id": "1b83a00e",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"goodreadsbooks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6bb0a8f9",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extraire_debut_descr(str_desc):\n",
|
||||
" description = \"\".join(str_desc.split(\"\\n\"))\n",
|
||||
" try:\n",
|
||||
" desc = description.split(\"## Get A Copy\")[0]\n",
|
||||
" if desc is None:\n",
|
||||
" desc_clean=\"None\"\n",
|
||||
" else:\n",
|
||||
" desc_clean=desc\n",
|
||||
" except Exception as N:\n",
|
||||
" desc_clean = str(N)\n",
|
||||
" return desc_clean"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "46bd7edd",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"extraire_debut_descr(goodreadsbooks['description'][0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "445a9d81",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"goodreadsbooks['description_clean'] = \\\n",
|
||||
"goodreadsbooks['description'] \\\n",
|
||||
".apply(lambda x: extraire_debut_descr(x)) \\\n",
|
||||
".apply(lambda x: x.replace('\\n',' ') \\\n",
|
||||
" .replace('_',' ') \\\n",
|
||||
" .replace('*',' ') \\\n",
|
||||
|
@ -157,6 +240,20 @@
|
|||
"### Chargement des données avec Spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4b452f5f",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"spacy.prefer_gpu()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -164,7 +261,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nlp = spacy.load(\"en_core_web_md\")"
|
||||
"nlp = spacy.load(\"en_core_web_trf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -184,7 +281,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
|
||||
"docs = nlp.pipe(goodreadsbooks['description_clean'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -212,9 +309,9 @@
|
|||
" keyword = []\n",
|
||||
" docs_list.append(doc)\n",
|
||||
" for token in doc:\n",
|
||||
" if(token.is_stop or token.text in punctuation):\n",
|
||||
" if token.is_stop or token.text in punctuation:\n",
|
||||
" continue\n",
|
||||
" if(token.pos_ in pos_tag):\n",
|
||||
" if token.pos_ in pos_tag:\n",
|
||||
" keyword.append({\n",
|
||||
" 'token': j,\n",
|
||||
" 'text': token.text,\n",
|
||||
|
@ -238,6 +335,20 @@
|
|||
"docs_keyword_exp = pd.DataFrame(docs_keyword).explode('keywords')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1cead436",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp[\"keywords\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -245,27 +356,91 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2 = pd.DataFrame(docs_keyword_exp['keywords'].values.tolist(), index=docs_keyword_exp.index)"
|
||||
"docs_keyword_exp2 = pd.json_normalize(docs_keyword_exp['keywords'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6d8bf599-af04-410b-9cc7-8113118a9daa",
|
||||
"metadata": {},
|
||||
"id": "443c23aa",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2['document'] = docs_keyword_exp['document']"
|
||||
"docs_keyword_exp2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c08de94-dc78-4f40-a925-d3f3a81b7db5",
|
||||
"metadata": {},
|
||||
"id": "5699ab10",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2 = docs_keyword_exp2.reset_index()"
|
||||
"docs_keyword_exp2.reset_index(inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f67259e5",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del docs_keyword_exp2[\"index\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a9fbd37",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8591c414",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2['document'] = docs_keyword_exp['document'].values"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "91c9bd5a",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -275,18 +450,10 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_keyword_exp2['lemma'] = docs_keyword_exp2['lemma'].apply(lambda x: x.lower())\n",
|
||||
"docs_keyword_exp2['text'] = docs_keyword_exp2['text'].apply(lambda x: x.lower())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "47e681aa-645b-49ce-97b0-a264aea863c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del docs_keyword_exp2['index']"
|
||||
"docs_keyword_exp2['lemma'] = \\\n",
|
||||
" docs_keyword_exp2['lemma'].apply(lambda x: str(x).lower())\n",
|
||||
"docs_keyword_exp2['text'] = \\\n",
|
||||
" docs_keyword_exp2['text'].apply(lambda x: str(x).lower())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -306,7 +473,60 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word_frequency = docs_keyword_exp2['lemma'].value_counts().to_dict()"
|
||||
"word_frequency = docs_keyword_exp2['lemma'] \\\n",
|
||||
" .value_counts() \\\n",
|
||||
" .to_dict()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15b050fd",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word_frequency"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eebc4dce",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wordcloud = WordCloud(width=512,\n",
|
||||
" height=512,\n",
|
||||
" max_font_size=72,\n",
|
||||
" max_words=1000,\n",
|
||||
" background_color=\"#ece9dc\") \\\n",
|
||||
".generate_from_frequencies(word_frequency)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "102f4099",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.figure(figsize=(10, 10))\n",
|
||||
"plt.imshow(wordcloud, interpolation='bilinear')\n",
|
||||
"plt.axis(\"off\")\n",
|
||||
"plt.savefig(\"out_img/wordcloud_words_regular.png\", format=\"png\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -324,7 +544,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs2 = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
|
||||
"docs2 = nlp.pipe(goodreadsbooks['description_clean'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -376,7 +596,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_chunks_exp2 = pd.DataFrame(docs_chunks_exp['keywords'].values.tolist())"
|
||||
"docs_chunks_exp2 = pd.json_normalize(docs_chunks_exp['keywords'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -386,7 +606,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_chunks_exp2['document'] = docs_chunks_exp['document']"
|
||||
"docs_chunks_exp2['document'] = docs_chunks_exp['document'].values"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -396,7 +616,21 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_chunks_exp2 = docs_chunks_exp2.reset_index()"
|
||||
"docs_chunks_exp2.reset_index(inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1119daae",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del docs_chunks_exp2[\"index\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -406,19 +640,12 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'].apply(lambda x: x.lower())\n",
|
||||
"docs_chunks_exp2['root'] = docs_chunks_exp2['root'].apply(lambda x: x.lower())\n",
|
||||
"docs_chunks_exp2['text'] = docs_chunks_exp2['text'].apply(lambda x: x.lower())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fd59bca3-19b3-4a25-abf3-bdbf00aac5b6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del docs_chunks_exp2['index']"
|
||||
"docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'] \\\n",
|
||||
" .apply(lambda x: str(x).lower())\n",
|
||||
"docs_chunks_exp2['root'] = docs_chunks_exp2['root'] \\\n",
|
||||
" .apply(lambda x: str(x).lower())\n",
|
||||
"docs_chunks_exp2['text'] = docs_chunks_exp2['text'] \\\n",
|
||||
" .apply(lambda x: str(x).lower())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -431,18 +658,6 @@
|
|||
"docs_chunks_exp2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5ec40fff-c929-47bb-bd89-24ebd4d82808",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word_frequency_chunks = docs_chunks_exp2['text'].value_counts().to_dict()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb82dc33-19a3-40a3-bbad-18df4fdc32bf",
|
||||
|
@ -453,6 +668,36 @@
|
|||
"### Word Cloud"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3da4c819",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word_frequency_chunks = docs_chunks_exp2['text'] \\\n",
|
||||
" .value_counts() \\\n",
|
||||
" .to_dict()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3eb6ff4",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"word_frequency_chunks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -515,7 +760,7 @@
|
|||
"image_colors = ImageColorGenerator(my_mask)\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(10, 10))\n",
|
||||
"plt.imshow(wordcloud_book.recolor(color_func=image_colors), interpolation='bilinear')\n",
|
||||
"plt.imshow(wordcloud_mask.recolor(color_func=image_colors), interpolation='bilinear')\n",
|
||||
"plt.axis(\"off\")\n",
|
||||
"plt.savefig(\"out_img/wordcloud_heart.png\", format=\"png\")\n",
|
||||
"plt.show()"
|
||||
|
@ -592,7 +837,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"your_word=sentence2_1[2].text"
|
||||
"your_word=sentence2_1[2].text.lower()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -609,7 +854,11 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8d1405a6-d25f-4bcc-917e-48b3aad70b01",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ms = nlp.vocab.vectors.most_similar(\n",
|
||||
|
@ -622,8 +871,12 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bae29e2f-b945-45a8-b925-5d35839e4f0b",
|
||||
"metadata": {},
|
||||
"id": "6154d722",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
|
@ -644,9 +897,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.9"
|
||||
"version": "3.10.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load diff
BIN
out_img/annees_publication.png
Normal file
BIN
out_img/annees_publication.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.8 KiB |
Binary file not shown.
Before Width: | Height: | Size: 228 KiB After Width: | Height: | Size: 176 KiB |
Binary file not shown.
Before Width: | Height: | Size: 341 KiB After Width: | Height: | Size: 385 KiB |
BIN
out_img/wordcloud_words_regular.png
Normal file
BIN
out_img/wordcloud_words_regular.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 401 KiB |
11146
scraped_data/goodreadsbooks_new.json
Normal file
11146
scraped_data/goodreadsbooks_new.json
Normal file
File diff suppressed because one or more lines are too long
|
@ -1,13 +1,15 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
import datetime
|
||||
|
||||
import pandas as pd
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.exporters import JsonLinesItemExporter
|
||||
from scrapy.item import Item, Field
|
||||
import html2text
|
||||
import re
|
||||
from dateutil.parser import parse
|
||||
|
||||
|
||||
class GoodReadsBookItem(Item):
|
||||
bookTitle = Field()
|
||||
|
@ -20,32 +22,44 @@ class GoodReadsBookItem(Item):
|
|||
reviewCount = Field()
|
||||
bookFormat = Field()
|
||||
numberOfPages = Field()
|
||||
datePublished = Field()
|
||||
isbn = Field()
|
||||
|
||||
|
||||
class GoodReadsBookSpider(scrapy.Spider):
|
||||
name = "goodreadsbookspider"
|
||||
|
||||
|
||||
def start_requests(self):
    """Yield one request per book URL listed in the scraped list file.

    The URLs come from the ``href`` column of
    ``scraped_data/goodreadslist.json``. Each request is handled by
    ``self.parse`` and is created with ``dont_filter=True``.
    """
    book_urls = pd.read_json("scraped_data/goodreadslist.json").href
    for book_url in book_urls:
        request = scrapy.Request(
            url=book_url,
            callback=self.parse,
            dont_filter=True,
        )
        yield request
|
||||
|
||||
def clean_string(self, strExtract):
    """Return ``strExtract`` without surrounding newlines and spaces.

    Args:
        strExtract: the extracted text, or ``None`` when nothing was
            extracted.

    Returns:
        str: ``""`` when ``strExtract`` is ``None``; otherwise the text
        with leading and trailing ``'\\n'`` and ``' '`` characters removed
        (other whitespace such as tabs is left untouched).
    """
    # Identity comparison is the correct way to test for None.
    if strExtract is None:
        return ""
    return strExtract.strip('\n ')
|
||||
|
||||
|
||||
def split_published_date(self, strPublisher):
    """Extract the publication date from a "Published ... by" text.

    The text is searched for ``Published <word> <digits[suffix]> <digits>
    by`` and the captured date is parsed with ``dateutil.parser.parse``.

    Args:
        strPublisher: text expected to contain the publication sentence.

    Returns:
        str: the parsed date in ISO 8601 format. When no date can be
        extracted or parsed, the current local time in ISO 8601 format is
        returned instead, so the result is always a string.
    """
    try:
        # NOTE(review): `.` does not match newlines and each `\s` matches a
        # single whitespace character, so input where "Published" is not on
        # the first line, or where words are separated by several
        # whitespace characters, will not match -- confirm against the real
        # page markup.
        datepublished_str = re.search(
            r'^.*Published\s(\w+\s\d+\w*\s\d+)\sby.*',
            strPublisher).group(1)
        return parse(datepublished_str).isoformat()
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError: the pattern did not match (re.search gave None).
        # TypeError: strPublisher is not a string.
        # ValueError / OverflowError: dateutil could not parse the date.
        return datetime.datetime.now().isoformat()
|
||||
|
||||
def parse(self, response):
|
||||
metacol = response.css('#metacol')
|
||||
|
||||
|
||||
converter = html2text.HTML2Text()
|
||||
converter.ignore_links = True
|
||||
|
||||
|
||||
data = GoodReadsBookItem(
|
||||
{
|
||||
'bookTitle': self.clean_string(metacol.css(
|
||||
|
@ -55,29 +69,48 @@ class GoodReadsBookSpider(scrapy.Spider):
|
|||
'authorURL': self.clean_string(metacol.css(
|
||||
'.authorName').attrib["href"]),
|
||||
'authorName': self.clean_string(metacol.css(
|
||||
'.authorName > span:nth-child(1)::text').extract_first()),
|
||||
'.authorName > span:nth-child(1)::text')
|
||||
.extract_first()),
|
||||
'description': converter.handle(metacol.css(
|
||||
'#description').extract_first()),
|
||||
'#descriptionContainer').extract_first()),
|
||||
'ratingValue': self.clean_string(metacol.css(
|
||||
'#bookMeta > span:nth-child(2)::text').extract_first()),
|
||||
'#bookMeta > span:nth-child(2)::text')
|
||||
.extract_first()),
|
||||
'ratingCount': self.clean_string(metacol.css(
|
||||
'a.gr-hyperlink:nth-child(7) > meta:nth-child(1)').attrib['content']),
|
||||
'a.gr-hyperlink:nth-child(7) > '
|
||||
'meta:nth-child(1)')
|
||||
.attrib[
|
||||
'content']),
|
||||
'reviewCount': self.clean_string(metacol.css(
|
||||
'a.gr-hyperlink:nth-child(9) > meta:nth-child(1)').attrib['content']),
|
||||
'a.gr-hyperlink:nth-child(9) > '
|
||||
'meta:nth-child(1)')
|
||||
.attrib[
|
||||
'content']),
|
||||
'bookFormat': self.clean_string(metacol.css(
|
||||
'#details > div:nth-child(1) > span:nth-child(1)::text').extract_first()),
|
||||
'#details > div:nth-child(1) >'
|
||||
' span:nth-child(1)::text').extract_first()),
|
||||
'numberOfPages': self.clean_string(metacol.css(
|
||||
'#details > div:nth-child(1) > span:nth-child(2)::text').extract_first()),
|
||||
'#details > div:nth-child(1) >'
|
||||
' span:nth-child(2)::text').extract_first())
|
||||
.replace(
|
||||
r' pages', ''),
|
||||
'datePublished': self.split_published_date(
|
||||
self.clean_string(metacol.css(
|
||||
'#details > div:nth-child(2)')
|
||||
.extract_first())),
|
||||
'isbn': self.clean_string(metacol.css(
|
||||
'div.infoBoxRowItem:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text').extract_first())
|
||||
'div.infoBoxRowItem:nth-child(2) >'
|
||||
' span:nth-child(1) > span:nth-child(1)::text')
|
||||
.extract_first())
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
if data['bookTitle'] != "":
|
||||
yield data
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
|
||||
process = CrawlerProcess({
|
||||
'AUTOTHROTTLE_ENABLED': 'True',
|
||||
'AUTOTHROTTLE_START_DELAY': 1.0,
|
||||
|
@ -85,15 +118,15 @@ process = CrawlerProcess({
|
|||
'COOKIES_ENABLED': False,
|
||||
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
|
||||
'FEEDS': {
|
||||
'goodreadsbooks.json':{
|
||||
'goodreadsbooks_new.json': {
|
||||
'format': 'json',
|
||||
'encoding': 'utf8',
|
||||
'store_empty': False,
|
||||
'fields': None,
|
||||
'indent': 4,
|
||||
'item_export_kwargs': {
|
||||
'export_empty_fields': True,
|
||||
}
|
||||
'encoding': 'utf8',
|
||||
'store_empty': False,
|
||||
'fields': None,
|
||||
'indent': 4,
|
||||
'item_export_kwargs': {
|
||||
'export_empty_fields': True,
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
|
|
@ -4,40 +4,56 @@
|
|||
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.exporters import JsonLinesItemExporter
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
|
||||
class GoodReadsListItem(Item):
    """One entry scraped from a Goodreads list page."""

    # Text of the book-title anchor.
    title = Field()

    # Absolute URL built from the anchor's href.
    href = Field()
|
||||
|
||||
|
||||
class GoodReadsListSpider(scrapy.Spider):
    """Spider that collects book titles and links from Goodreads lists.

    It requests the nine "Banned Books According to Krause" list pages and
    yields one ``GoodReadsListItem`` per ``a.bookTitle`` anchor found.
    """

    name = "goodreadslistspider"

    def start_requests(self):
        """Yield one request per list page, handled by ``self.parse``."""
        urls = [
            "https://www.goodreads.com/list/show/"
            "168413.Banned_Books_According_to_Krause_Part_1_",
            "https://www.goodreads.com/list/show/"
            "168429.Banned_Books_According_to_Krause_Part_7",
            "https://www.goodreads.com/list/show/"
            "168430.Banned_Books_According_to_Krause_Part_8",
            "https://www.goodreads.com/list/show/"
            "168425.Banned_Books_According_to_Krause_Part_4",
            "https://www.goodreads.com/list/show/"
            "168420.Banned_Books_According_to_Krause_Part_2",
            "https://www.goodreads.com/list/show/"
            "168424.Banned_Books_According_to_Krause_Part_3",
            "https://www.goodreads.com/list/show/"
            "168428.Banned_Books_According_to_Krause_Part_6",
            "https://www.goodreads.com/list/show/"
            "168426.Banned_Books_According_to_Krause_Part_5",
            "https://www.goodreads.com/list/show/"
            "168432.Banned_Books_According_to_Krause_Part_9",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield a ``GoodReadsListItem`` for every book anchor on the page.

        Args:
            response: the downloaded list page.

        Yields:
            GoodReadsListItem: ``title`` is the text of the anchor's
            ``span``; ``href`` is the anchor's ``href`` attribute prefixed
            with ``https://goodreads.com``.
        """
        for anchor in response.css("a.bookTitle"):
            yield GoodReadsListItem({
                'title': anchor.css('span::text').get(),
                'href': "https://goodreads.com" + anchor.attrib['href'],
            })
|
||||
|
||||
# Il faut désactiver le cache et les cookies sinon le site web a un comportement imprévisible et ne retourne pas les bonnes données
|
||||
|
||||
# Il faut désactiver le cache et les cookies
|
||||
# sinon le site web a un comportement imprévisible
|
||||
# et ne retourne pas les bonnes données
|
||||
|
||||
process = CrawlerProcess({
|
||||
'AUTOTHROTTLE_ENABLED': 'True',
|
||||
|
@ -45,20 +61,19 @@ process = CrawlerProcess({
|
|||
'HTTPCACHE_ENABLED': False,
|
||||
'COOKIES_ENABLED': False,
|
||||
'FEEDS': {
|
||||
'scraped_data/goodreadslist.json':{
|
||||
'scraped_data/goodreadslist.json': {
|
||||
'format': 'json',
|
||||
'encoding': 'utf8',
|
||||
'store_empty': False,
|
||||
'fields': None,
|
||||
'indent': 4,
|
||||
'item_export_kwargs': {
|
||||
'export_empty_fields': True,
|
||||
}
|
||||
'encoding': 'utf8',
|
||||
'store_empty': False,
|
||||
'fields': None,
|
||||
'indent': 4,
|
||||
'item_export_kwargs': {
|
||||
'export_empty_fields': True,
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
process.crawl(GoodReadsListSpider)
|
||||
process.start()
|
||||
process.stop()
|
||||
process.stop()
|
||||
|
|
Loading…
Reference in a new issue