backup-thrivecart-learn/thrivecart-backup.ipynb
2023-07-10 14:59:05 -04:00

474 lines
13 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Backup ThriveCart Learn\n",
"\n",
"Licence Apache 2.0\n",
"\n",
"Copyright 2023 François Pelletier\n",
"\n",
"## Chargement des informations"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import os"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from thrivecart_utils import extract_image_filename_thrivecart"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Set the URL of your course here, exactly as learners access it. It must include the trailing / at the end of the URL.\n",
"\n",
"url = ''"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Set the User-Agent string here; this dictionary is passed as the headers of every request below.\n",
"headers = {\n",
" 'User-Agent': ''\n",
"}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Put all the cookies here in a dictionary; rename the keys as needed.\n",
"# NOTE(review): these look like session cookies, presumably sensitive; avoid committing filled-in values.\n",
"cookies = {\n",
" 'thrivecart_v2': '',\n",
" '__stripe_mid': '',\n",
" 'tc_account': '',\n",
" 'tcc_v1_23439': ''\n",
"}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Page d'accueil\n",
"\n",
"Dans cette section, on va sauvegarder la page d'accueil de la formation, visible une fois connecté à ThriveCart."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Téléchargement de la page"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Fetch the course home page, sending the configured User-Agent and cookies.\n",
"# The timeout keeps a stalled connection from hanging the notebook indefinitely.\n",
"response_home = requests.get(url, headers=headers, cookies=cookies, timeout=30)\n",
"# Fail fast on an HTTP error instead of silently saving an error page as the backup.\n",
"response_home.raise_for_status()\n",
"html_content_home = response_home.text"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Enregistrement de la page"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Save the home page as html/index.html.\n",
"current_dir = 'html'\n",
"os.makedirs(current_dir, exist_ok=True)\n",
"# Explicit UTF-8: the default text encoding depends on the locale and can fail on non-ASCII HTML.\n",
"with open(os.path.join(current_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"    f.write(html_content_home)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Soupe de la page"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Parse the downloaded home page with Python's built-in HTML parser\n",
"home_soup = BeautifulSoup(markup=html_content_home, features='html.parser')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Téléchargement des images de la page"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Collect the URL of every image referenced by the home page.\n",
"home_images_dir = os.path.join(current_dir, 'images')\n",
"os.makedirs(home_images_dir, exist_ok=True)\n",
"home_images = []\n",
"for image in home_soup.find_all('img'):\n",
"    src = image.get('src')\n",
"    # Skip <img> tags without a src and non-HTTP sources (relative paths, data: URIs):\n",
"    # requests.get cannot fetch them and would raise in the download cell.\n",
"    if src and src.startswith(('http://', 'https://')):\n",
"        home_images.append(src)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Download each home page image into the home images directory.\n",
"for home_image in home_images:\n",
"    # The timeout keeps one stalled download from blocking the whole backup.\n",
"    response_home_image = requests.get(home_image, headers=headers, cookies=cookies, timeout=30)\n",
"    image_filename = extract_image_filename_thrivecart(home_image)\n",
"    if response_home_image.status_code == 200:\n",
"        # Save the image to a file\n",
"        with open(os.path.join(home_images_dir, image_filename), 'wb') as file:\n",
"            file.write(response_home_image.content)\n",
"        print(f\"Image downloaded and saved as {image_filename}\")\n",
"    else:\n",
"        # Name the failing URL and status so the missing image can be retried by hand.\n",
"        print(f\"Failed to download the image: {home_image} (HTTP {response_home_image.status_code})\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Modules\n",
"\n",
"Dans cette section, on va sauvegarder les modules de la formation, accessibles via des liens sur la page d'accueil de la formation."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Extraction des URLs des modules depuis la page d'accueil"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Collect the module links: anchors pointing below the course URL, excluding the course URL itself.\n",
"module_urls = []\n",
"for link in home_soup.find_all('a'):\n",
"    href = link.get('href')\n",
"    if href and href.startswith(url) and not href.endswith(url):\n",
"        # The same module may be linked more than once; keep one entry (first-seen order)\n",
"        # so that it is not downloaded repeatedly.\n",
"        if href not in module_urls:\n",
"            module_urls.append(href)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Download every module page with its images, and collect the lesson links found on it.\n",
"# A dedicated name is used for the output directory so that `current_dir` (set by the\n",
"# home page cells) is not overwritten if those cells are re-run later.\n",
"modules_dir = 'html/modules/'\n",
"lessons_urls = []\n",
"for module_url in module_urls:\n",
"    print(module_url)\n",
"\n",
"    # Fetch the module page with the configured User-Agent and cookies\n",
"    response_module = requests.get(module_url, headers=headers, cookies=cookies, timeout=30)\n",
"    html_content_module = response_module.text\n",
"\n",
"    # Save the page; the module name is the second-to-last '/'-separated part of the URL\n",
"    current_module = module_url.split('/')[-2]\n",
"    module_dir = os.path.join(modules_dir, current_module)\n",
"    os.makedirs(module_dir, exist_ok=True)\n",
"    # Explicit UTF-8: the default text encoding depends on the locale\n",
"    with open(os.path.join(module_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"        f.write(html_content_module)\n",
"\n",
"    # Parse the module page\n",
"    module_soup = BeautifulSoup(html_content_module, 'html.parser')\n",
"\n",
"    # Keep only the images served from spark.thrivecart.com.\n",
"    # `src` is None for <img> tags without that attribute, so test it before startswith.\n",
"    module_images = []\n",
"    for image in module_soup.find_all('img'):\n",
"        src = image.get('src')\n",
"        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
"            module_images.append(src)\n",
"\n",
"    # Download the images next to the module page\n",
"    module_images_dir = os.path.join(module_dir, 'images')\n",
"    os.makedirs(module_images_dir, exist_ok=True)\n",
"    for module_image in module_images:\n",
"        response_module_image = requests.get(module_image, headers=headers, cookies=cookies, timeout=30)\n",
"        image_filename = extract_image_filename_thrivecart(module_image)\n",
"        if response_module_image.status_code == 200:\n",
"            # Save the image to a file\n",
"            with open(os.path.join(module_images_dir, image_filename), 'wb') as file:\n",
"                file.write(response_module_image.content)\n",
"\n",
"    # Lesson links: anchors below the course URL that are neither the course nor a module\n",
"    for link in module_soup.find_all('a'):\n",
"        lesson_url = link.get('href')\n",
"        if lesson_url and lesson_url.startswith(url) and not lesson_url.endswith(url) and lesson_url not in module_urls:\n",
"            lesson_pair = (current_module, lesson_url)\n",
"            # Skip repeated links so that a lesson is downloaded only once per module\n",
"            if lesson_pair not in lessons_urls:\n",
"                lessons_urls.append(lesson_pair)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Leçons\n",
"\n",
"Dans cette section, on va sauvegarder les leçons de la formation, accessibles via des liens dans chaque module de la formation."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Téléchargement des leçons et de leurs images"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Download every lesson page, with its images, into the directory of its module.\n",
"for current_module, lesson_url in lessons_urls:\n",
"    # Fetch the lesson page with the configured User-Agent and cookies\n",
"    response_lesson = requests.get(lesson_url, headers=headers, cookies=cookies, timeout=30)\n",
"    html_content_lesson = response_lesson.text\n",
"\n",
"    # Save the page; the lesson name is the second-to-last '/'-separated part of the URL\n",
"    current_lesson = lesson_url.split('/')[-2]\n",
"    lesson_dir = os.path.join('html/modules/', current_module, current_lesson)\n",
"    os.makedirs(lesson_dir, exist_ok=True)\n",
"    # Explicit UTF-8: the default text encoding depends on the locale\n",
"    with open(os.path.join(lesson_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"        f.write(html_content_lesson)\n",
"    print(f'Fichier enregistré: {current_lesson}/index.html')\n",
"\n",
"    # Parse the lesson page\n",
"    lesson_soup = BeautifulSoup(html_content_lesson, 'html.parser')\n",
"\n",
"    # Keep only the images served from spark.thrivecart.com.\n",
"    # `src` is None for <img> tags without that attribute, so test it before startswith.\n",
"    lesson_images = []\n",
"    for image in lesson_soup.find_all('img'):\n",
"        src = image.get('src')\n",
"        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
"            lesson_images.append(src)\n",
"\n",
"    # Download the images next to the lesson page\n",
"    lesson_images_dir = os.path.join(lesson_dir, 'images')\n",
"    os.makedirs(lesson_images_dir, exist_ok=True)\n",
"    for lesson_image in lesson_images:\n",
"        response_lesson_image = requests.get(lesson_image, headers=headers, cookies=cookies, timeout=30)\n",
"        image_filename = extract_image_filename_thrivecart(lesson_image)\n",
"        # Only write successful responses, as the module cell does; otherwise the body\n",
"        # of an error page would be saved under the image's file name.\n",
"        if response_lesson_image.status_code == 200:\n",
"            with open(os.path.join(lesson_images_dir, image_filename), 'wb') as file:\n",
"                file.write(response_lesson_image.content)\n",
"            print(f'Fichier enregistré: {image_filename}')\n",
"        else:\n",
"            print(f'Échec du téléchargement: {lesson_image} (HTTP {response_lesson_image.status_code})')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Création d'une table des matières"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# List every saved index.html under html/, parent directories before their children.\n",
"list_index_html = []\n",
"for dirpath, dirnames, filenames in os.walk(\"html\"):\n",
"    # os.walk yields entries in arbitrary order; sorting dirnames in place (allowed in\n",
"    # top-down mode) makes the traversal, and so the table of contents, deterministic.\n",
"    dirnames.sort()\n",
"    for filename in sorted(filenames):\n",
"        if filename.endswith('index.html'):\n",
"            list_index_html.append(os.path.join(dirpath, filename))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Build one table-of-contents entry (page URL, title, depth) per saved index.html.\n",
"list_title = []\n",
"for index_html in list_index_html:\n",
"    # Read bytes so BeautifulSoup detects the encoding itself; the with-block closes the file.\n",
"    with open(index_html, 'rb') as html_file:\n",
"        soup = BeautifulSoup(html_file, 'html.parser')\n",
"    # Normalise Windows separators so the local path can be mapped back onto the course URL.\n",
"    relative_path = index_html.replace(os.sep, \"/\")\n",
"    page = url+relative_path.replace(\"html/\",\"\").replace(\"modules/\",\"\").replace(\"index.html\",\"\")\n",
"    # Pages without a usable <title> fall back to their URL instead of raising.\n",
"    raw_title = soup.title.string if soup.title and soup.title.string else page\n",
"    title = raw_title.replace(\" » Propulsé par ThriveCart\",\"\")\n",
"    # Depth = number of non-empty '/'-separated parts of the URL, minus 3\n",
"    # (presumably scheme, host and one leading path segment; depends on the shape of `url`).\n",
"    level = len([i for i in page.split(\"/\") if i != \"\"])-3\n",
"    list_title.append({\"page\": page, \"title\": title, \"level\": level})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"def generate_link(page, title, level):\n",
"    \"\"\"Build one Markdown line of the table of contents.\n",
"\n",
"    Args:\n",
"        page: URL of the page, expected to end with '/'.\n",
"        title: Text displayed for the link.\n",
"        level: Depth of the page. Level 2 lines are prefixed with \"Module\",\n",
"            all other levels with \"-\".\n",
"\n",
"    Returns:\n",
"        The Markdown link followed by a newline. For level 3 pages, the third\n",
"        '/'-separated part from the end of the URL is dropped first (presumably the\n",
"        module name, which is part of the local path but not of the online lesson URL).\n",
"    \"\"\"\n",
"    if level == 3:\n",
"        parts = page.split(\"/\")\n",
"        # Delete by position: list.remove() would delete the first part with the same\n",
"        # text, which is the wrong one when an earlier part of the URL has the same name.\n",
"        del parts[-3]\n",
"        page = \"/\".join(parts)\n",
"    prefix = \"Module\" if level == 2 else \"-\"\n",
"    return prefix+\" [\"+title+\"](\"+page+\")\\n\""
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Write the Markdown table of contents. Mode \"w\" truncates any previous file, so no\n",
"# separate delete step is needed. UTF-8 is explicit because the content contains\n",
"# accented characters and pandoc reads its input as UTF-8.\n",
"with open(\"table_matiere.md\", \"w\", encoding=\"utf-8\") as f:\n",
"    f.write(\"# Table des matières\\n\\n\")\n",
"    for tt in list_title:\n",
"        is_module = tt[\"level\"] == 2\n",
"        # Level 2 entries are set apart by a horizontal rule before and blank lines after\n",
"        if is_module:\n",
"            f.write(\"\\n\\n----\\n\\n\")\n",
"        f.write(generate_link(tt[\"page\"], tt[\"title\"], tt[\"level\"]))\n",
"        if is_module:\n",
"            f.write(\"\\n\\n\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Convert the Markdown table of contents to HTML.\n",
"# The input file is a positional argument; pandoc's -i flag means --incremental, not \"input\".\n",
"!pandoc table_matiere.md -o table_matiere.html"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}