backup-thrivecart-learn/thrivecart-backup.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Backup ThriveCart Learn\n",
    "\n",
    "Apache Licence 2.0\n",
    "Copyright 2023 François Pelletier\n",
    "\n",
    "## Chargement des informations"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import os"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from thrivecart_utils import extract_image_filename_thrivecart"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Set the course URL here, exactly as learners access it.\n",
    "# It must end with a trailing slash (/): later cells use it as a prefix to recognise\n",
    "# course links and append page paths directly to it.\n",
    "\n",
    "url = ''"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Set the User-Agent here; these headers are sent with every request made in this notebook.\n",
    "headers = {\n",
    "    'User-Agent': ''\n",
    "}"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Put all the cookies in this dictionary; rename the keys as needed.\n",
    "# They are sent with every request made in this notebook.\n",
    "# NOTE(review): the numeric suffix of 'tcc_v1_23439' looks account- or course-specific --\n",
    "# confirm the exact cookie names against your own browser session.\n",
    "# Cookie values act as credentials: avoid committing this notebook with them filled in.\n",
    "cookies = {\n",
    "    'thrivecart_v2': '',\n",
    "    '__stripe_mid': '',\n",
    "    'tc_account': '',\n",
    "    'tcc_v1_23439': ''\n",
    "}"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Page d'accueil\n",
    "\n",
    "Dans cette section, on va sauvegarder la page d'accueil de la formation, visible une fois connecté à ThriveCart."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Téléchargement de la page"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Fetch the course home page, sending the configured User-Agent and cookies.\n",
    "response_home = requests.get(url, headers=headers, cookies=cookies)\n",
    "# Stop on an HTTP error (4xx/5xx): otherwise the error page would be saved as the backup\n",
    "# and every following cell would work on it.\n",
    "response_home.raise_for_status()\n",
    "html_content_home = response_home.text"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Enregistrement de la page"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Save the home page HTML as html/index.html.\n",
    "current_dir = 'html'\n",
    "os.makedirs(current_dir, exist_ok=True)\n",
    "# Write explicitly as UTF-8: the platform default encoding (e.g. cp1252 on Windows)\n",
    "# can fail on, or mis-encode, characters found in the page.\n",
    "with open(os.path.join(current_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
    "    f.write(html_content_home)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Soupe de la page"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Parse the home page HTML with the 'html.parser' backend so that the cells below\n",
    "# can search it for <img> and <a> tags.\n",
    "home_soup = BeautifulSoup(html_content_home, 'html.parser')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Téléchargement des images de la page"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Create the images directory and collect the source URL of every image on the home page.\n",
    "home_images_dir = os.path.join(current_dir, 'images')\n",
    "os.makedirs(home_images_dir, exist_ok=True)\n",
    "home_images = []\n",
    "for image in home_soup.find_all('img'):\n",
    "    src = image.get('src')\n",
    "    # Skip <img> tags without a src, and sources that are not absolute HTTP(S) URLs\n",
    "    # (relative paths, data: URIs): requests.get() raises on those in the download cell.\n",
    "    if src and src.startswith(('http://', 'https://')):\n",
    "        home_images.append(src)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Download each home page image; only HTTP 200 responses are written to disk.\n",
    "for image_url in home_images:\n",
    "    response_home_image = requests.get(image_url, headers=headers, cookies=cookies)\n",
    "    image_filename = extract_image_filename_thrivecart(image_url)\n",
    "    if response_home_image.status_code != 200:\n",
    "        print(\"Failed to download the image\")\n",
    "        continue\n",
    "    # Store the raw bytes in the home page images directory.\n",
    "    image_path = os.path.join(home_images_dir, image_filename)\n",
    "    with open(image_path, 'wb') as file:\n",
    "        file.write(response_home_image.content)\n",
    "    print(f\"Image downloaded and saved as {image_filename}\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Modules\n",
    "\n",
    "Dans cette section, on va sauvegarder les modules de la formation, accessibles via des liens sur la page d'accueil de la formation."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Extraction des URLs des modules depuis la page d'accueil"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Collect the module links: anchors that point inside the course (they start with `url`)\n",
    "# but not at the home page itself.\n",
    "module_urls = []\n",
    "for link in home_soup.find_all('a'):\n",
    "    href = link.get('href')\n",
    "    is_module_link = href and href.startswith(url) and not href.endswith(url)\n",
    "    # The same module can be linked several times on the page; keep the first occurrence\n",
    "    # only so it is not downloaded more than once. Page order is preserved.\n",
    "    if is_module_link and href not in module_urls:\n",
    "        module_urls.append(href)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Download every module page with its images, and collect the lesson links found on it.\n",
    "# `lessons_urls` ends up as a list of (module folder name, lesson URL) pairs.\n",
    "modules_dir = 'html/modules/'\n",
    "lessons_urls = []\n",
    "for module_url in module_urls:\n",
    "    print(module_url)\n",
    "\n",
    "    # Fetch the module page with the configured User-Agent and cookies\n",
    "    response_module = requests.get(module_url, headers=headers, cookies=cookies)\n",
    "    html_content_module = response_module.text\n",
    "\n",
    "    # Save the page as html/modules/<module>/index.html. The folder name is the last\n",
    "    # path segment of the URL, whether or not the URL ends with a slash.\n",
    "    current_module = module_url.rstrip('/').split('/')[-1]\n",
    "    module_dir = os.path.join(modules_dir, current_module)\n",
    "    os.makedirs(module_dir, exist_ok=True)\n",
    "    # Explicit UTF-8 so the result does not depend on the platform default encoding\n",
    "    with open(os.path.join(module_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
    "        f.write(html_content_module)\n",
    "\n",
    "    # Parse the module page\n",
    "    module_soup = BeautifulSoup(html_content_module, 'html.parser')\n",
    "\n",
    "    # Keep the images served from spark.thrivecart.com; an <img> without a src\n",
    "    # attribute yields None and is skipped instead of raising AttributeError.\n",
    "    module_images = []\n",
    "    for image in module_soup.find_all('img'):\n",
    "        src = image.get('src')\n",
    "        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
    "            module_images.append(src)\n",
    "\n",
    "    # Download the images into html/modules/<module>/images/\n",
    "    module_images_dir = os.path.join(module_dir, 'images')\n",
    "    os.makedirs(module_images_dir, exist_ok=True)\n",
    "    for module_image in module_images:\n",
    "        response_module_image = requests.get(module_image, headers=headers, cookies=cookies)\n",
    "        image_filename = extract_image_filename_thrivecart(module_image)\n",
    "        if response_module_image.status_code == 200:\n",
    "            with open(os.path.join(module_images_dir, image_filename), 'wb') as file:\n",
    "                file.write(response_module_image.content)\n",
    "        else:\n",
    "            # Report the failure instead of skipping the image silently\n",
    "            print(f\"Failed to download the image: {module_image}\")\n",
    "\n",
    "    # Collect the lesson links: course links that are neither the home page nor a module.\n",
    "    # Each (module, lesson) pair is recorded once even if the page links to it repeatedly.\n",
    "    for link in module_soup.find_all('a'):\n",
    "        lesson_url = link.get('href')\n",
    "        is_lesson_link = (lesson_url and lesson_url.startswith(url)\n",
    "                          and not lesson_url.endswith(url)\n",
    "                          and lesson_url not in module_urls)\n",
    "        if is_lesson_link and (current_module, lesson_url) not in lessons_urls:\n",
    "            lessons_urls.append((current_module, lesson_url))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Leçons\n",
    "\n",
    "Dans cette section, on va sauvegarder les leçons de la formation, accessibles via des liens dans chaque module de la formation."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Téléchargement des leçons et de leurs images"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Download every lesson page and its images into html/modules/<module>/<lesson>/.\n",
    "for current_module, lesson_url in lessons_urls:\n",
    "\n",
    "    # Fetch the lesson page with the configured User-Agent and cookies\n",
    "    response_lesson = requests.get(lesson_url, headers=headers, cookies=cookies)\n",
    "    html_content_lesson = response_lesson.text\n",
    "\n",
    "    # The folder name is the last path segment of the URL, whether or not the URL\n",
    "    # ends with a slash.\n",
    "    current_lesson = lesson_url.rstrip('/').split('/')[-1]\n",
    "    lesson_dir = os.path.join('html/modules/', current_module, current_lesson)\n",
    "    os.makedirs(lesson_dir, exist_ok=True)\n",
    "    # Explicit UTF-8 so the result does not depend on the platform default encoding\n",
    "    with open(os.path.join(lesson_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
    "        f.write(html_content_lesson)\n",
    "    print(f'Fichier enregistré: {current_lesson}/index.html')\n",
    "\n",
    "    # Parse the lesson page\n",
    "    lesson_soup = BeautifulSoup(html_content_lesson, 'html.parser')\n",
    "\n",
    "    # Keep the images served from spark.thrivecart.com; an <img> without a src\n",
    "    # attribute yields None and is skipped instead of raising AttributeError.\n",
    "    lesson_images = []\n",
    "    for image in lesson_soup.find_all('img'):\n",
    "        src = image.get('src')\n",
    "        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
    "            lesson_images.append(src)\n",
    "\n",
    "    # Download the images into html/modules/<module>/<lesson>/images/\n",
    "    lesson_images_dir = os.path.join(lesson_dir, 'images')\n",
    "    os.makedirs(lesson_images_dir, exist_ok=True)\n",
    "    for lesson_image in lesson_images:\n",
    "        response_lesson_image = requests.get(lesson_image, headers=headers, cookies=cookies)\n",
    "        image_filename = extract_image_filename_thrivecart(lesson_image)\n",
    "        # Only write successful responses, as the home page and module cells do;\n",
    "        # otherwise an error body would be saved under the image's name.\n",
    "        if response_lesson_image.status_code != 200:\n",
    "            print(f'Échec du téléchargement: {lesson_image}')\n",
    "            continue\n",
    "        with open(os.path.join(lesson_images_dir, image_filename), 'wb') as file:\n",
    "            file.write(response_lesson_image.content)\n",
    "        print(f'Fichier enregistré: {image_filename}')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Création d'une table des matières"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Walk the html/ tree and collect the path of every file whose name ends with 'index.html'.\n",
    "list_index_html = []\n",
    "for dir_path, _dir_names, file_names in os.walk(\"html\"):\n",
    "    for file_name in file_names:\n",
    "        if file_name.endswith('index.html'):\n",
    "            list_index_html.append(os.path.join(dir_path, file_name))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Build one table-of-contents entry (page URL, title, depth level) per saved index.html.\n",
    "list_title = []\n",
    "for index_html in list_index_html:\n",
    "    # Read the file as bytes so BeautifulSoup works out the encoding itself;\n",
    "    # the `with` block closes the file handle once the page is parsed.\n",
    "    with open(index_html, 'rb') as html_file:\n",
    "        soup = BeautifulSoup(html_file, 'html.parser')\n",
    "\n",
    "    # Rebuild the online URL from whole folder names: drop the leading 'html' root, the\n",
    "    # file name and the 'modules' container folder. Substring replacement would also\n",
    "    # mangle slugs that merely contain 'html/' or 'modules/'.\n",
    "    folders = index_html.split(os.sep)[1:-1]\n",
    "    if folders and folders[0] == 'modules':\n",
    "        folders = folders[1:]\n",
    "    page = url + ''.join(folder + '/' for folder in folders)\n",
    "\n",
    "    # A page without a usable <title> falls back to its URL instead of raising\n",
    "    raw_title = soup.title.string if soup.title and soup.title.string else page\n",
    "    title = raw_title.replace(\" » Propulsé par ThriveCart\", \"\")\n",
    "\n",
    "    # Depth = number of non-empty '/'-separated pieces of the page URL, minus 3.\n",
    "    # NOTE(review): the offset presumably assumes a course URL made of four such pieces\n",
    "    # (scheme, host, two path segments), giving 2 for modules and 3 for lessons -- confirm.\n",
    "    level = len([i for i in page.split(\"/\") if i != \"\"]) - 3\n",
    "    list_title.append({\"page\": page, \"title\": title, \"level\": level})"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def generate_link(page, title, level):\n",
    "    \"\"\"Return one Markdown line linking to `page`, for the table of contents.\n",
    "\n",
    "    Level 2 entries are prefixed with 'Module'; every other level becomes a '-' list item.\n",
    "    For level 3 entries, the third-from-last '/'-separated segment of `page` is dropped\n",
    "    before the link is built.\n",
    "    \"\"\"\n",
    "    if level == 3:\n",
    "        segments = page.split(\"/\")\n",
    "        # Delete by position: list.remove() deletes the first segment with an equal\n",
    "        # value, which is the wrong one when that value also appears earlier in the URL.\n",
    "        del segments[-3]\n",
    "        page = \"/\".join(segments)\n",
    "    prefix = \"Module\" if level == 2 else \"-\"\n",
    "    return prefix + \" [\" + title + \"](\" + page + \")\\n\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Write the Markdown table of contents. Mode 'w' truncates any previous file, so\n",
    "# re-running the cell starts from an empty file. The file is written as UTF-8, which is\n",
    "# the encoding pandoc expects for its input.\n",
    "with open(\"table_matiere.md\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(\"# Table des matières\\n\\n\")\n",
    "    for tt in list_title:\n",
    "        is_module = tt[\"level\"] == 2\n",
    "        # Each module entry is preceded by a horizontal rule and followed by a blank line\n",
    "        if is_module:\n",
    "            f.write(\"\\n\\n----\\n\\n\")\n",
    "        f.write(generate_link(tt[\"page\"], tt[\"title\"], tt[\"level\"]))\n",
    "        if is_module:\n",
    "            f.write(\"\\n\\n\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Convert the Markdown table of contents to HTML (requires pandoc on the PATH).\n",
    "# The input file is passed positionally: pandoc's -i flag means --incremental, not \"input\".\n",
    "!pandoc table_matiere.md -o table_matiere.html"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}