ajustements faits en écrivant l'article

2022-02-06 18:26:27 -05:00 · 2022-02-06 18:26:27 -05:00 · 382ceb5727
commit 382ceb5727
parent 76a385e1d0
11 changed files with 12368 additions and 916 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+.idea/
+.ipynb_checkpoints/
+
--- a/Enrichissement.ipynb
+++ b/Enrichissement.ipynb
@ -5,9 +5,9 @@
   "id": "423050d6-415a-4d17-b0fc-0446357d6bb7",
   "metadata": {},
   "source": [
-    "# Livres à censurer\n",
+    "# Livres à censurer\n",
    " \n",
-    "Étude de la liste de livres à censurer de Matt Krause, un dinosaure républicain du Texas\n",
+    "Étude de la liste de livres à censurer de Matt Krause, un élu républicain du Texas\n",
    "\n",
    "Source des informations: \n",
    "\n",
@ -28,11 +28,10 @@
    "import spacy\n",
    "import re\n",
    "from spacy import displacy\n",
-    "from collections import Counter\n",
    "from string import punctuation\n",
    "\n",
    "# Dataviz stuff\n",
-    "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
+    "from wordcloud import WordCloud, ImageColorGenerator\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from PIL import Image\n",
@ -57,7 +56,22 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "krausebooklist = pd.read_csv(\"krausebooklist.csv\", dtype={'Published': int})"
+    "krausebooklist = pd.read_csv(\"scraped_data/krausebooklist.csv\",\n",
+    "                             dtype={'Published': int})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c32d2af",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "krausebooklist"
   ]
  },
  {
@ -75,16 +89,28 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "plt.figure(figsize=(5,5)) \n",
-    "sns.histplot(data=krausebooklist, \n",
-    "             x=\"Published\", \n",
+    "plt.figure(figsize=(5,5))\n",
+    "plt.tight_layout()\n",
+    "sns.histplot(data=krausebooklist,\n",
+    "             x=\"Published\",\n",
    "             discrete=False, \n",
    "             binrange=[1965,2025],\n",
    "             binwidth=5)\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
+    "plt.savefig(\"out_img/annees_publication.png\")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ceaa1c99",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "markdown",
   "id": "2881f4b2-1569-42a6-8d04-50b2e6ee1c59",
@ -100,18 +126,75 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks.json\")"
+    "goodreadsbooks = pd.read_json(\"scraped_data/goodreadsbooks_new.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "1c7c84ff-7a85-4164-bf58-5b8f946eed21",
-   "metadata": {},
+   "id": "1b83a00e",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "goodreadsbooks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6bb0a8f9",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def extraire_debut_descr(str_desc):\n",
+    "    \"\"\"Return the text preceding the '## Get A Copy' marker.\n",
+    "\n",
+    "    Newlines are removed first. str.split always returns at least\n",
+    "    one element, so when the marker is absent the whole text is\n",
+    "    returned unchanged (minus its newlines).\n",
+    "    \"\"\"\n",
+    "    description = str_desc.replace(\"\\n\", \"\")\n",
+    "    return description.split(\"## Get A Copy\")[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46bd7edd",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "extraire_debut_descr(goodreadsbooks['description'][0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "445a9d81",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "goodreadsbooks['description_clean'] = \\\n",
    "goodreadsbooks['description'] \\\n",
+    ".apply(lambda x: extraire_debut_descr(x)) \\\n",
    ".apply(lambda x: x.replace('\\n',' ') \\\n",
    "       .replace('_',' ') \\\n",
    "       .replace('*',' ') \\\n",
@ -157,6 +240,20 @@
    "### Chargement des données avec Spacy"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b452f5f",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "spacy.prefer_gpu()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -164,7 +261,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "nlp = spacy.load(\"en_core_web_md\")"
+    "nlp = spacy.load(\"en_core_web_trf\")"
   ]
  },
  {
@ -184,7 +281,7 @@
   },
   "outputs": [],
   "source": [
-    "docs = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
+    "docs = nlp.pipe(goodreadsbooks['description_clean'])"
   ]
  },
  {
@ -212,9 +309,9 @@
    "    keyword = []\n",
    "    docs_list.append(doc)\n",
    "    for token in doc:\n",
-    "        if(token.is_stop or token.text in punctuation):\n",
+    "        if token.is_stop or token.text in punctuation:\n",
    "            continue\n",
-    "        if(token.pos_ in pos_tag):\n",
+    "        if token.pos_ in pos_tag:\n",
    "            keyword.append({\n",
    "                'token': j,\n",
    "                'text': token.text,\n",
@ -238,6 +335,20 @@
    "docs_keyword_exp = pd.DataFrame(docs_keyword).explode('keywords')"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cead436",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docs_keyword_exp[\"keywords\"]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -245,27 +356,91 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_keyword_exp2 = pd.DataFrame(docs_keyword_exp['keywords'].values.tolist(), index=docs_keyword_exp.index)"
+    "docs_keyword_exp2 = pd.json_normalize(docs_keyword_exp['keywords'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "6d8bf599-af04-410b-9cc7-8113118a9daa",
-   "metadata": {},
+   "id": "443c23aa",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
-    "docs_keyword_exp2['document'] = docs_keyword_exp['document']"
+    "docs_keyword_exp2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "9c08de94-dc78-4f40-a925-d3f3a81b7db5",
-   "metadata": {},
+   "id": "5699ab10",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
-    "docs_keyword_exp2 = docs_keyword_exp2.reset_index()"
+    "docs_keyword_exp2.reset_index(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f67259e5",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "del docs_keyword_exp2[\"index\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a9fbd37",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docs_keyword_exp2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8591c414",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docs_keyword_exp2['document'] = docs_keyword_exp['document'].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91c9bd5a",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docs_keyword_exp2"
   ]
  },
  {
@ -275,18 +450,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_keyword_exp2['lemma'] = docs_keyword_exp2['lemma'].apply(lambda x: x.lower())\n",
-    "docs_keyword_exp2['text'] = docs_keyword_exp2['text'].apply(lambda x: x.lower())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47e681aa-645b-49ce-97b0-a264aea863c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "del docs_keyword_exp2['index']"
+    "docs_keyword_exp2['lemma'] = \\\n",
+    "    docs_keyword_exp2['lemma'].apply(lambda x: str(x).lower())\n",
+    "docs_keyword_exp2['text'] = \\\n",
+    "    docs_keyword_exp2['text'].apply(lambda x: str(x).lower())"
   ]
  },
  {
@ -306,7 +473,60 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "word_frequency = docs_keyword_exp2['lemma'].value_counts().to_dict()"
+    "word_frequency = docs_keyword_exp2['lemma'] \\\n",
+    "    .value_counts() \\\n",
+    "    .to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15b050fd",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "word_frequency"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eebc4dce",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "wordcloud = WordCloud(width=512,\n",
+    "                      height=512,\n",
+    "                      max_font_size=72,\n",
+    "                      max_words=1000,\n",
+    "                      background_color=\"#ece9dc\") \\\n",
+    ".generate_from_frequencies(word_frequency)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "102f4099",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(10, 10))\n",
+    "plt.imshow(wordcloud, interpolation='bilinear')\n",
+    "plt.axis(\"off\")\n",
+    "plt.savefig(\"out_img/wordcloud_words_regular.png\", format=\"png\")\n",
+    "plt.show()"
   ]
  },
  {
@ -324,7 +544,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs2 = nlp.pipe(goodreadsbooks['description_clean'], n_process=7)"
+    "docs2 = nlp.pipe(goodreadsbooks['description_clean'])"
   ]
  },
  {
@ -376,7 +596,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_chunks_exp2 = pd.DataFrame(docs_chunks_exp['keywords'].values.tolist())"
+    "docs_chunks_exp2 = pd.json_normalize(docs_chunks_exp['keywords'])"
   ]
  },
  {
@ -386,7 +606,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_chunks_exp2['document'] = docs_chunks_exp['document']"
+    "docs_chunks_exp2['document'] = docs_chunks_exp['document'].values"
   ]
  },
  {
@ -396,7 +616,21 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_chunks_exp2 = docs_chunks_exp2.reset_index()"
+    "docs_chunks_exp2.reset_index(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1119daae",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "del docs_chunks_exp2[\"index\"]"
   ]
  },
  {
@ -406,19 +640,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'].apply(lambda x: x.lower())\n",
-    "docs_chunks_exp2['root'] = docs_chunks_exp2['root'].apply(lambda x: x.lower())\n",
-    "docs_chunks_exp2['text'] = docs_chunks_exp2['text'].apply(lambda x: x.lower())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fd59bca3-19b3-4a25-abf3-bdbf00aac5b6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "del docs_chunks_exp2['index']"
+    "docs_chunks_exp2['lemma'] = docs_chunks_exp2['lemma'] \\\n",
+    "    .apply(lambda x: str(x).lower())\n",
+    "docs_chunks_exp2['root'] = docs_chunks_exp2['root'] \\\n",
+    "    .apply(lambda x: str(x).lower())\n",
+    "docs_chunks_exp2['text'] = docs_chunks_exp2['text'] \\\n",
+    "    .apply(lambda x: str(x).lower())"
   ]
  },
  {
@ -431,18 +658,6 @@
    "docs_chunks_exp2"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5ec40fff-c929-47bb-bd89-24ebd4d82808",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "word_frequency_chunks = docs_chunks_exp2['text'].value_counts().to_dict()"
-   ]
-  },
  {
   "cell_type": "markdown",
   "id": "eb82dc33-19a3-40a3-bbad-18df4fdc32bf",
@ -453,6 +668,36 @@
    "### Word Cloud"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3da4c819",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "word_frequency_chunks = docs_chunks_exp2['text'] \\\n",
+    "    .value_counts() \\\n",
+    "    .to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3eb6ff4",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "word_frequency_chunks"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -515,7 +760,7 @@
    "image_colors = ImageColorGenerator(my_mask)\n",
    "\n",
    "plt.figure(figsize=(10, 10))\n",
-    "plt.imshow(wordcloud_book.recolor(color_func=image_colors), interpolation='bilinear')\n",
+    "plt.imshow(wordcloud_mask.recolor(color_func=image_colors), interpolation='bilinear')\n",
    "plt.axis(\"off\")\n",
    "plt.savefig(\"out_img/wordcloud_heart.png\", format=\"png\")\n",
    "plt.show()"
@ -592,7 +837,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "your_word=sentence2_1[2].text"
+    "your_word=sentence2_1[2].text.lower()"
   ]
  },
  {
@ -609,7 +854,11 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "8d1405a6-d25f-4bcc-917e-48b3aad70b01",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "ms = nlp.vocab.vectors.most_similar(\n",
@ -622,8 +871,12 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "bae29e2f-b945-45a8-b925-5d35839e4f0b",
-   "metadata": {},
+   "id": "6154d722",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": []
  }
@ -644,9 +897,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.9"
+   "version": "3.10.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
--- a/clean_data/goodreads_description_clean.csv
+++ b/clean_data/goodreads_description_clean.csv
--- a/out_img/annees_publication.png
+++ b/out_img/annees_publication.png
--- a/out_img/wordcloud_heart.png
+++ b/out_img/wordcloud_heart.png
--- a/out_img/wordcloud_regular.png
+++ b/out_img/wordcloud_regular.png
--- a/out_img/wordcloud_words_regular.png
+++ b/out_img/wordcloud_words_regular.png
--- a/scraped_data/goodreadsbooks_new.json
+++ b/scraped_data/goodreadsbooks_new.json
--- a/scraped_data/goodreadsbooks_old.json
+++ b/scraped_data/goodreadsbooks_old.json
--- a/scraping_goodreads_books.py
+++ b/scraping_goodreads_books.py
@ -1,13 +1,15 @@
 #!/usr/bin/env python
 # coding: utf-8
-
+import datetime

 import pandas as pd
 import scrapy
 from scrapy.crawler import CrawlerProcess
-from scrapy.exporters import JsonLinesItemExporter
 from scrapy.item import Item, Field
 import html2text
+import re
+from dateutil.parser import parse
+

 class GoodReadsBookItem(Item):
    bookTitle = Field()
@ -20,32 +22,44 @@ class GoodReadsBookItem(Item):
    reviewCount = Field()
    bookFormat = Field()
    numberOfPages = Field()
+    datePublished = Field()
    isbn = Field()

+
 class GoodReadsBookSpider(scrapy.Spider):
    name = "goodreadsbookspider"
-    
+
    def start_requests(self):
        urls = pd.read_json("scraped_data/goodreadslist.json").href
        for url in urls:
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True
-                                )
-            
+                                 )
+
    def clean_string(self, strExtract):
-        if strExtract==None:
+        if strExtract is None:
            x = ""
        else:
            x = strExtract.strip('\n ')
        return x
-    
+
+    def split_published_date(self, strPublisher):
+        """Extract the publication date from a 'Published <date> by' text.
+
+        Returns the date as an ISO 8601 string. When the pattern is not
+        found or the captured text cannot be parsed, the current
+        timestamp is returned instead, also as an ISO 8601 string, so
+        the field always has the same type.
+        """
+        try:
+            match = re.search(
+                r'^.*Published\s(\w+\s\d+\w*\s\d+)\sby.*',
+                strPublisher)
+            if match is not None:
+                return parse(match.group(1)).isoformat()
+        except (TypeError, ValueError, OverflowError):
+            # TypeError: strPublisher is not a string.
+            # ValueError / OverflowError: dateutil could not interpret
+            # the captured text as a date. Fall through to the default.
+            pass
+        return datetime.datetime.now().isoformat()
+
    def parse(self, response):
        metacol = response.css('#metacol')
-        
+
        converter = html2text.HTML2Text()
        converter.ignore_links = True
-        
+
        data = GoodReadsBookItem(
            {
                'bookTitle': self.clean_string(metacol.css(
@ -55,29 +69,48 @@ class GoodReadsBookSpider(scrapy.Spider):
                'authorURL': self.clean_string(metacol.css(
                    '.authorName').attrib["href"]),
                'authorName': self.clean_string(metacol.css(
-                    '.authorName > span:nth-child(1)::text').extract_first()),
+                    '.authorName > span:nth-child(1)::text')
+                                                .extract_first()),
                'description': converter.handle(metacol.css(
-                    '#description').extract_first()),
+                    '#descriptionContainer').extract_first()),
                'ratingValue': self.clean_string(metacol.css(
-                    '#bookMeta > span:nth-child(2)::text').extract_first()),
+                    '#bookMeta > span:nth-child(2)::text')
+                                                 .extract_first()),
                'ratingCount': self.clean_string(metacol.css(
-                    'a.gr-hyperlink:nth-child(7) > meta:nth-child(1)').attrib['content']),
+                    'a.gr-hyperlink:nth-child(7) > '
+                    'meta:nth-child(1)')
+                                                 .attrib[
+                                                     'content']),
                'reviewCount': self.clean_string(metacol.css(
-                    'a.gr-hyperlink:nth-child(9) > meta:nth-child(1)').attrib['content']),
+                    'a.gr-hyperlink:nth-child(9) > '
+                    'meta:nth-child(1)')
+                                                 .attrib[
+                                                     'content']),
                'bookFormat': self.clean_string(metacol.css(
-                    '#details > div:nth-child(1) > span:nth-child(1)::text').extract_first()),
+                    '#details > div:nth-child(1) >'
+                    ' span:nth-child(1)::text').extract_first()),
                'numberOfPages': self.clean_string(metacol.css(
-                    '#details > div:nth-child(1) > span:nth-child(2)::text').extract_first()),
+                    '#details > div:nth-child(1) >'
+                    ' span:nth-child(2)::text').extract_first())
+                    .replace(
+                    r' pages', ''),
+                'datePublished': self.split_published_date(
+                    self.clean_string(metacol.css(
+                        '#details > div:nth-child(2)')
+                                      .extract_first())),
                'isbn': self.clean_string(metacol.css(
-                    'div.infoBoxRowItem:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text').extract_first())
+                    'div.infoBoxRowItem:nth-child(2) >'
+                    ' span:nth-child(1) > span:nth-child(1)::text')
+                                          .extract_first())
            }
        )
-        
+
        if data['bookTitle'] != "":
            yield data
        else:
            pass
-        
+
+
 process = CrawlerProcess({
    'AUTOTHROTTLE_ENABLED': 'True',
    'AUTOTHROTTLE_START_DELAY': 1.0,
@ -85,15 +118,15 @@ process = CrawlerProcess({
    'COOKIES_ENABLED': False,
    'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    'FEEDS': {
-        'goodreadsbooks.json':{
+        'goodreadsbooks_new.json': {
            'format': 'json',
-        'encoding': 'utf8',
-        'store_empty': False,
-        'fields': None,
-        'indent': 4,
-        'item_export_kwargs': {
-           'export_empty_fields': True,
-        }
+            'encoding': 'utf8',
+            'store_empty': False,
+            'fields': None,
+            'indent': 4,
+            'item_export_kwargs': {
+                'export_empty_fields': True,
+            }
        }
    }
 })
--- a/scraping_goodreads_list.py
+++ b/scraping_goodreads_list.py
@ -4,40 +4,56 @@

 import scrapy
 from scrapy.crawler import CrawlerProcess
-from scrapy.exporters import JsonLinesItemExporter
 from scrapy.item import Item, Field

+
 class GoodReadsListItem(Item):
    title = Field()
    href = Field()

+
 class GoodReadsListSpider(scrapy.Spider):
    name = "goodreadslistspider"
+
    def start_requests(self):
-        urls = ["https://www.goodreads.com/list/show/168413.Banned_Books_According_to_Krause_Part_1_",
-               "https://www.goodreads.com/list/show/168429.Banned_Books_According_to_Krause_Part_7",
-               "https://www.goodreads.com/list/show/168430.Banned_Books_According_to_Krause_Part_8",
-               "https://www.goodreads.com/list/show/168425.Banned_Books_According_to_Krause_Part_4",
-               "https://www.goodreads.com/list/show/168420.Banned_Books_According_to_Krause_Part_2",
-               "https://www.goodreads.com/list/show/168424.Banned_Books_According_to_Krause_Part_3",
-               "https://www.goodreads.com/list/show/168428.Banned_Books_According_to_Krause_Part_6",
-               "https://www.goodreads.com/list/show/168426.Banned_Books_According_to_Krause_Part_5",
-               "https://www.goodreads.com/list/show/168432.Banned_Books_According_to_Krause_Part_9"
-              ]
+        urls = [
+            "https://www.goodreads.com/list/show/"
+            "168413.Banned_Books_According_to_Krause_Part_1_",
+            "https://www.goodreads.com/list/show/"
+            "168429.Banned_Books_According_to_Krause_Part_7",
+            "https://www.goodreads.com/list/show/"
+            "168430.Banned_Books_According_to_Krause_Part_8",
+            "https://www.goodreads.com/list/show/"
+            "168425.Banned_Books_According_to_Krause_Part_4",
+            "https://www.goodreads.com/list/show/"
+            "168420.Banned_Books_According_to_Krause_Part_2",
+            "https://www.goodreads.com/list/show/"
+            "168424.Banned_Books_According_to_Krause_Part_3",
+            "https://www.goodreads.com/list/show/"
+            "168428.Banned_Books_According_to_Krause_Part_6",
+            "https://www.goodreads.com/list/show/"
+            "168426.Banned_Books_According_to_Krause_Part_5",
+            "https://www.goodreads.com/list/show/"
+            "168432.Banned_Books_According_to_Krause_Part_9"
+            ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
-    
+
    def parse(self, response):
        page = response.url.split("/")
        anchors = response.css("a.bookTitle")
        for anchor in anchors:
            data = GoodReadsListItem({
                'title': anchor.css('span::text').get(),
-                'href': "https://goodreads.com"+anchor.attrib['href']
+                'href': "https://goodreads.com" + anchor.attrib[
+                    'href']
            })
            yield data

-# Il faut désactiver le cache et les cookies sinon le site web a un comportement imprévisible et ne retourne pas les bonnes données
+
+# Il faut désactiver le cache et les cookies
+# sinon le site web a un comportement imprévisible
+# et ne retourne pas les bonnes données

 process = CrawlerProcess({
    'AUTOTHROTTLE_ENABLED': 'True',
@ -45,20 +61,19 @@ process = CrawlerProcess({
    'HTTPCACHE_ENABLED': False,
    'COOKIES_ENABLED': False,
    'FEEDS': {
-        'scraped_data/goodreadslist.json':{
+        'scraped_data/goodreadslist.json': {
            'format': 'json',
-        'encoding': 'utf8',
-        'store_empty': False,
-        'fields': None,
-        'indent': 4,
-        'item_export_kwargs': {
-           'export_empty_fields': True,
-        }
+            'encoding': 'utf8',
+            'store_empty': False,
+            'fields': None,
+            'indent': 4,
+            'item_export_kwargs': {
+                'export_empty_fields': True,
+            }
        }
    }
 })

-
 process.crawl(GoodReadsListSpider)
 process.start()
-process.stop()
+process.stop()