{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Backup ThriveCart Learn\n",
    "\n",
    "Apache Licence 2.0\n",
    "Copyright 2023 François Pelletier\n",
    "\n",
    "## Chargement des informations"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from thrivecart_utils import extract_image_filename_thrivecart"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Set this to the course URL exactly as learners access it. It must end with a trailing '/'.\n",
    "\n",
    "url = ''"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Set the User-Agent header sent with every request\n",
    "headers = {\n",
    "    'User-Agent': ''\n",
    "}"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Put all the session cookies in this dictionary; rename the keys as needed.\n",
    "# NOTE: these values are credentials - do not share or commit the notebook with them filled in.\n",
    "cookies = {\n",
    "    'thrivecart_v2': '',\n",
    "    '__stripe_mid': '',\n",
    "    'tc_account': '',\n",
    "    'tcc_v1_23439': ''\n",
    "}"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Layout of the backup on disk and network settings shared by every section below\n",
    "BACKUP_DIR = 'html'\n",
    "MODULES_DIR = os.path.join(BACKUP_DIR, 'modules')\n",
    "# Only images served from this prefix are saved for module and lesson pages\n",
    "IMAGE_URL_PREFIX = 'https://spark.thrivecart.com/'\n",
    "# Seconds to wait for the server before giving up, so a stalled request cannot hang the notebook\n",
    "REQUEST_TIMEOUT = 30"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Fonctions utilitaires\n",
    "\n",
    "Fonctions partagées par les sections suivantes pour télécharger et enregistrer les pages et les images."
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def fetch(target_url):\n",
    "    \"\"\"Send a GET request to `target_url` with the configured User-Agent and cookies.\n",
    "\n",
    "    Returns the response as-is; callers decide how to react to the status code.\n",
    "    \"\"\"\n",
    "    return requests.get(target_url, headers=headers, cookies=cookies, timeout=REQUEST_TIMEOUT)\n",
    "\n",
    "\n",
    "def save_page(html_content, page_dir):\n",
    "    \"\"\"Write `html_content` to `page_dir`/index.html, creating the directory if needed.\"\"\"\n",
    "    os.makedirs(page_dir, exist_ok=True)\n",
    "    # Explicit UTF-8: the platform default encoding may be unable to encode accented characters\n",
    "    with open(os.path.join(page_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
    "        f.write(html_content)\n",
    "\n",
    "\n",
    "def collect_image_urls(soup, prefix=('http://', 'https://')):\n",
    "    \"\"\"Return the distinct `src` values of the <img> tags in `soup` that start with `prefix`.\n",
    "\n",
    "    `prefix` is a string or a tuple of strings, as accepted by `str.startswith`.\n",
    "    Tags without a `src` attribute are skipped; document order is preserved.\n",
    "    \"\"\"\n",
    "    found = []\n",
    "    for image in soup.find_all('img'):\n",
    "        src = image.get('src')\n",
    "        if src and src.startswith(prefix) and src not in found:\n",
    "            found.append(src)\n",
    "    return found\n",
    "\n",
    "\n",
    "def save_image(image_url, images_dir):\n",
    "    \"\"\"Download `image_url` into the existing directory `images_dir`.\n",
    "\n",
    "    Returns the file name used on disk, or None when the request failed or the\n",
    "    server did not answer 200 (nothing is written in that case).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        response = fetch(image_url)\n",
    "    except requests.RequestException:\n",
    "        return None\n",
    "    if response.status_code != 200:\n",
    "        return None\n",
    "    image_filename = extract_image_filename_thrivecart(image_url)\n",
    "    with open(os.path.join(images_dir, image_filename), 'wb') as file:\n",
    "        file.write(response.content)\n",
    "    return image_filename"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Page d'accueil\n",
    "\n",
    "Dans cette section, on va sauvegarder la page d'accueil de la formation, visible une fois connecté à ThriveCart."
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Téléchargement de la page"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# The link filters and the table of contents below build on `url` as a prefix ending with '/'\n",
    "if not url.endswith('/'):\n",
    "    raise ValueError('url must be set and must end with a trailing slash')\n",
    "\n",
    "# Send a GET request to the URL with the user agent and cookies\n",
    "response_home = fetch(url)\n",
    "# Stop here on an HTTP error instead of backing up an error page\n",
    "response_home.raise_for_status()\n",
    "html_content_home = response_home.text"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Enregistrement de la page"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "save_page(html_content_home, BACKUP_DIR)"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Soupe de la page"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Create a BeautifulSoup object with the HTML content\n",
    "home_soup = BeautifulSoup(html_content_home, 'html.parser')"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Téléchargement des images de la page"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Collect the URL of every image on the home page (any http/https host)\n",
    "home_images_dir = os.path.join(BACKUP_DIR, 'images')\n",
    "os.makedirs(home_images_dir, exist_ok=True)\n",
    "home_images = collect_image_urls(home_soup)"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Download each home page image and report the outcome\n",
    "for home_image in home_images:\n",
    "    image_filename = save_image(home_image, home_images_dir)\n",
    "    if image_filename is not None:\n",
    "        print(f'Image downloaded and saved as {image_filename}')\n",
    "    else:\n",
    "        print('Failed to download the image')"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Modules\n",
    "\n",
    "Dans cette section, on va sauvegarder les modules de la formation, accessible via des liens sur la page d'accueil de la formation."
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Extraction des URLs des modules depuis la page d'accueil"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Keep the links that live under the course URL, excluding the home page itself.\n",
    "# A module linked several times on the page is kept once.\n",
    "module_urls = []\n",
    "for link in home_soup.find_all('a'):\n",
    "    href = link.get('href')\n",
    "    if href and href.startswith(url) and not href.endswith(url) and href not in module_urls:\n",
    "        module_urls.append(href)"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Back up every module page and collect the lesson links found on it\n",
    "lessons_urls = []\n",
    "for module_url in module_urls:\n",
    "    print(module_url)\n",
    "\n",
    "    # Send a GET request to the URL with the user agent and cookies\n",
    "    response_module = fetch(module_url)\n",
    "    html_content_module = response_module.text\n",
    "\n",
    "    # Save the page; the folder is named after the second-to-last '/'-separated part of the URL\n",
    "    current_module = module_url.split('/')[-2]\n",
    "    module_dir = os.path.join(MODULES_DIR, current_module)\n",
    "    save_page(html_content_module, module_dir)\n",
    "\n",
    "    # Create a BeautifulSoup object with the HTML content\n",
    "    module_soup = BeautifulSoup(html_content_module, 'html.parser')\n",
    "\n",
    "    # Download the images of the module page\n",
    "    module_images_dir = os.path.join(module_dir, 'images')\n",
    "    os.makedirs(module_images_dir, exist_ok=True)\n",
    "    for module_image in collect_image_urls(module_soup, IMAGE_URL_PREFIX):\n",
    "        if save_image(module_image, module_images_dir) is None:\n",
    "            print('Failed to download the image')\n",
    "\n",
    "    # Keep the links under the course URL that are neither the home page nor a module page\n",
    "    for link in module_soup.find_all('a'):\n",
    "        lesson_url = link.get('href')\n",
    "        lesson_entry = (current_module, lesson_url)\n",
    "        if (lesson_url and lesson_url.startswith(url) and not lesson_url.endswith(url)\n",
    "                and lesson_url not in module_urls and lesson_entry not in lessons_urls):\n",
    "            lessons_urls.append(lesson_entry)"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Leçons\n",
    "\n",
    "Dans cette section, on va sauvegarder les leçons de la formation, accessible via des liens dans chaque module de la formation."
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Extraction des URLs des leçons depuis la page d'accueil"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Back up every lesson page inside the folder of the module it was found in\n",
    "for current_module, lesson_url in lessons_urls:\n",
    "    # Send a GET request to the URL with the user agent and cookies\n",
    "    response_lesson = fetch(lesson_url)\n",
    "    html_content_lesson = response_lesson.text\n",
    "\n",
    "    # Save the page\n",
    "    current_lesson = lesson_url.split('/')[-2]\n",
    "    lesson_dir = os.path.join(MODULES_DIR, current_module, current_lesson)\n",
    "    save_page(html_content_lesson, lesson_dir)\n",
    "    print(f'Fichier enregistré: {current_lesson}/index.html')\n",
    "\n",
    "    # Create a BeautifulSoup object with the HTML content\n",
    "    lesson_soup = BeautifulSoup(html_content_lesson, 'html.parser')\n",
    "\n",
    "    # Download the images of the lesson page; only successful downloads are written to disk\n",
    "    lesson_images_dir = os.path.join(lesson_dir, 'images')\n",
    "    os.makedirs(lesson_images_dir, exist_ok=True)\n",
    "    for lesson_image in collect_image_urls(lesson_soup, IMAGE_URL_PREFIX):\n",
    "        image_filename = save_image(lesson_image, lesson_images_dir)\n",
    "        if image_filename is not None:\n",
    "            print(f'Fichier enregistré: {image_filename}')\n",
    "        else:\n",
    "            print('Failed to download the image')"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Création d'une table des matières"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Walk the backup top-down so that a module page is listed before its lessons.\n",
    "# Sorting the directory names in place makes the traversal order deterministic.\n",
    "list_index_html = []\n",
    "for dp, dn, fn in os.walk(BACKUP_DIR):\n",
    "    dn.sort()\n",
    "    for f in sorted(fn):\n",
    "        if f.endswith('index.html'):\n",
    "            list_index_html.append(os.path.join(dp, f))"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Extract the title of every saved page and rebuild the URL it was downloaded from\n",
    "list_title = []\n",
    "for index_html in list_index_html:\n",
    "    # Read bytes and let BeautifulSoup detect the encoding; the file is closed right after parsing\n",
    "    with open(index_html, 'rb') as f:\n",
    "        soup = BeautifulSoup(f, 'html.parser')\n",
    "    # Path of the page folder relative to the backup root, without the 'modules' folder\n",
    "    rel_dir = os.path.relpath(os.path.dirname(index_html), BACKUP_DIR)\n",
    "    segments = [segment for segment in rel_dir.split(os.sep) if segment not in ('', '.')]\n",
    "    if segments and segments[0] == 'modules':\n",
    "        segments = segments[1:]\n",
    "    page = url + ''.join(segment + '/' for segment in segments)\n",
    "    # Pages without a usable <title> fall back to their URL\n",
    "    title = soup.title.string if soup.title and soup.title.string else page\n",
    "    title = title.replace(' » Propulsé par ThriveCart', '')\n",
    "    # Depth of the page: number of non-empty '/'-separated parts of its URL, minus 3\n",
    "    level = len([i for i in page.split('/') if i != '']) - 3\n",
    "    list_title.append({'page': page, 'title': title, 'level': level})"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def generate_link(page, title, level):\n",
    "    \"\"\"Return one Markdown line of the table of contents linking `title` to `page`.\n",
    "\n",
    "    Level-2 entries are prefixed with 'Module', every other level with '-'.\n",
    "    For level-3 entries the third '/'-separated part from the end of `page` is\n",
    "    dropped before the link is built.\n",
    "    \"\"\"\n",
    "    if level == 3:\n",
    "        parts = page.split('/')\n",
    "        # `page` ends with '/', so parts[-1] is '' and parts[-3] is the module folder name.\n",
    "        # Delete by position: list.remove() would drop the first part with the same text instead.\n",
    "        # NOTE(review): presumably lesson URLs on the site do not contain the module name - confirm.\n",
    "        del parts[-3]\n",
    "        page = '/'.join(parts)\n",
    "    prefix = 'Module' if level == 2 else '-'\n",
    "    return prefix + ' [' + title + '](' + page + ')\\n'"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Mode 'w' truncates any previous table of contents; UTF-8 is the encoding pandoc reads\n",
    "with open('table_matiere.md', 'w', encoding='utf-8') as f:\n",
    "    f.write('# Table des matières\\n\\n')\n",
    "    for tt in list_title:\n",
    "        # Level-2 entries are set apart by a horizontal rule before and blank lines after\n",
    "        if tt['level'] == 2:\n",
    "            f.write('\\n\\n----\\n\\n')\n",
    "        f.write(generate_link(tt['page'], tt['title'], tt['level']))\n",
    "        if tt['level'] == 2:\n",
    "            f.write('\\n\\n')"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "!pandoc -i table_matiere.md -o table_matiere.html"
   ],
   "metadata": { "collapsed": false }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}