392 lines
11 KiB
Text
392 lines
11 KiB
Text
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"## Chargement des informations"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import requests\n",
|
||
|
"from bs4 import BeautifulSoup\n",
|
||
|
"import os"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from thrivecart_utils import extract_image_filename_thrivecart"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Mettre l'URL de ta formation ici tel qu'accédé par les apprenants. Doit inclure le / à la fin de l'URL.\n",
|
||
|
"\n",
|
||
|
"url = ''"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Mettre ici le User-Agent\n",
|
||
|
"headers = {\n",
|
||
|
" 'User-Agent': ''\n",
|
||
|
"}"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Mettre tous les cookies ici dans un dictionnaire, changer les noms au besoin.\n",
|
||
|
"cookies = {\n",
|
||
|
" 'thrivecart_v2': '',\n",
|
||
|
" '__stripe_mid': '',\n",
|
||
|
" 'tc_account': '',\n",
|
||
|
" 'tcc_v1_23439': ''\n",
|
||
|
"}"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"## Page d'accueil\n",
|
||
|
"\n",
|
||
|
"Dans cette section, on va sauvegarder la page d'accueil de la formation, visible une fois connecté à ThriveCart."
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Téléchargement de la page"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Fetch the course home page, sending the configured User-Agent and session cookies\n",
"response_home = requests.get(url, headers=headers, cookies=cookies)\n",
"# Stop on an HTTP error status (4xx/5xx) instead of archiving an error page as the home page\n",
"response_home.raise_for_status()\n",
"html_content_home = response_home.text"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Enregistrement de la page"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Directory that receives the archived home page\n",
"current_dir = 'html'\n",
"os.makedirs(current_dir, exist_ok=True)\n",
"# Explicit encoding: with the platform default (e.g. cp1252 on Windows) the write\n",
"# can raise UnicodeEncodeError on characters outside that code page\n",
"with open(os.path.join(current_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"    f.write(html_content_home)"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Soupe de la page"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Parse the downloaded home page with the built-in HTML parser so its tags can be searched\n",
"home_soup = BeautifulSoup(markup=html_content_home, features='html.parser')"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Téléchargement des images de la page"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Collect the source URL of every img tag found on the home page\n",
"home_images_dir = os.path.join(current_dir, 'images')\n",
"os.makedirs(home_images_dir, exist_ok=True)\n",
"home_images = []\n",
"for image in home_soup.find_all('img'):\n",
"    src = image.get('src')\n",
"    # An img tag without a src attribute yields None, which cannot be requested later\n",
"    if src:\n",
"        home_images.append(src)"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Download every home page image with the same User-Agent and cookies as the page itself\n",
"for image_url in home_images:\n",
"    image_response = requests.get(image_url, headers=headers, cookies=cookies)\n",
"    image_filename = extract_image_filename_thrivecart(image_url)\n",
"    if image_response.status_code != 200:\n",
"        print(\"Failed to download the image\")\n",
"        continue\n",
"    # Store the raw bytes under the file name derived from the image URL\n",
"    image_path = os.path.join(home_images_dir, image_filename)\n",
"    with open(image_path, 'wb') as file:\n",
"        file.write(image_response.content)\n",
"    print(f\"Image downloaded and saved as {image_filename}\")"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"## Modules\n",
|
||
|
"\n",
|
||
|
"Dans cette section, on va sauvegarder les modules de la formation, accessibles via des liens sur la page d'accueil de la formation."
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Extraction des URLs des modules depuis la page d'accueil"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Collect the links located under the course URL, excluding links to the home page itself\n",
"module_urls = []\n",
"for link in home_soup.find_all('a'):\n",
"    href = link.get('href')\n",
"    # The same module may be linked more than once on the page; keep each URL once\n",
"    # so that a module is not downloaded twice\n",
"    if href and href.startswith(url) and not href.endswith(url) and href not in module_urls:\n",
"        module_urls.append(href)"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Download every module page with its images and collect the lesson links it contains.\n",
"# A dedicated name is used for the modules directory so that current_dir, which the\n",
"# home page cells rely on, is not overwritten.\n",
"modules_dir = 'html/modules/'\n",
"lessons_urls = []\n",
"for module_url in module_urls:\n",
"    print(module_url)\n",
"\n",
"    # Fetch the module page with the configured User-Agent and cookies\n",
"    response_module = requests.get(module_url, headers=headers, cookies=cookies)\n",
"    html_content_module = response_module.text\n",
"\n",
"    # Save the page in a directory named after the second-to-last URL segment\n",
"    # (the module slug when the URL ends with a slash)\n",
"    current_module = module_url.split('/')[-2]\n",
"    module_dir = os.path.join(modules_dir, current_module)\n",
"    os.makedirs(module_dir, exist_ok=True)\n",
"    # Explicit encoding so that the write does not depend on the platform default\n",
"    with open(os.path.join(module_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"        f.write(html_content_module)\n",
"\n",
"    # Parse the page to find its images and links\n",
"    module_soup = BeautifulSoup(html_content_module, 'html.parser')\n",
"\n",
"    # Keep the images whose URL starts with the spark.thrivecart.com prefix;\n",
"    # img tags without a src attribute return None and are skipped\n",
"    module_images = []\n",
"    for image in module_soup.find_all('img'):\n",
"        src = image.get('src')\n",
"        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
"            module_images.append(src)\n",
"\n",
"    # Download the images into an images directory next to the page\n",
"    module_images_dir = os.path.join(module_dir, 'images')\n",
"    os.makedirs(module_images_dir, exist_ok=True)\n",
"    for module_image in module_images:\n",
"        response_module_image = requests.get(module_image, headers=headers, cookies=cookies)\n",
"        image_filename = extract_image_filename_thrivecart(module_image)\n",
"        if response_module_image.status_code == 200:\n",
"            # Save the image to a file\n",
"            with open(os.path.join(module_images_dir, image_filename), 'wb') as file:\n",
"                file.write(response_module_image.content)\n",
"\n",
"    # Links under the course URL that are neither the home page nor a module\n",
"    # are treated as lessons of the current module\n",
"    for link in module_soup.find_all('a'):\n",
"        lesson_url = link.get('href')\n",
"        if lesson_url and lesson_url.startswith(url) and not lesson_url.endswith(url) and lesson_url not in module_urls:\n",
"            lesson_entry = (current_module, lesson_url)\n",
"            # Record each lesson once even when the page links to it several times\n",
"            if lesson_entry not in lessons_urls:\n",
"                lessons_urls.append(lesson_entry)"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"## Leçons\n",
|
||
|
"\n",
|
||
|
"Dans cette section, on va sauvegarder les leçons de la formation, accessibles via des liens dans chaque module de la formation."
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"### Téléchargement des leçons à partir des URLs extraites des modules"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Download every collected lesson page and its images into the directory of its module.\n",
"# Each entry of lessons_urls is a (module name, lesson URL) pair.\n",
"for current_module, lesson_url in lessons_urls:\n",
"    # Fetch the lesson page with the configured User-Agent and cookies\n",
"    response_lesson = requests.get(lesson_url, headers=headers, cookies=cookies)\n",
"    html_content_lesson = response_lesson.text\n",
"\n",
"    # Save the page under html/modules/MODULE/LESSON/index.html, where LESSON is the\n",
"    # second-to-last URL segment (the lesson slug when the URL ends with a slash)\n",
"    current_lesson = lesson_url.split('/')[-2]\n",
"    lesson_dir = os.path.join('html/modules/', current_module, current_lesson)\n",
"    os.makedirs(lesson_dir, exist_ok=True)\n",
"    # Explicit encoding so that the write does not depend on the platform default\n",
"    with open(os.path.join(lesson_dir, 'index.html'), 'w', encoding='utf-8') as f:\n",
"        f.write(html_content_lesson)\n",
"    print(f'Fichier enregistré: {current_lesson}/index.html')\n",
"\n",
"    # Parse the page to find its images\n",
"    lesson_soup = BeautifulSoup(html_content_lesson, 'html.parser')\n",
"\n",
"    # Keep the images whose URL starts with the spark.thrivecart.com prefix;\n",
"    # img tags without a src attribute return None and are skipped\n",
"    lesson_images = []\n",
"    for image in lesson_soup.find_all('img'):\n",
"        src = image.get('src')\n",
"        if src and src.startswith(\"https://spark.thrivecart.com/\"):\n",
"            lesson_images.append(src)\n",
"\n",
"    # Download the images into an images directory next to the page\n",
"    lesson_images_dir = os.path.join(lesson_dir, 'images')\n",
"    os.makedirs(lesson_images_dir, exist_ok=True)\n",
"    for lesson_image in lesson_images:\n",
"        response_lesson_image = requests.get(lesson_image, headers=headers, cookies=cookies)\n",
"        image_filename = extract_image_filename_thrivecart(lesson_image)\n",
"        # Only write successful responses so that an error body is never saved as an image\n",
"        if response_lesson_image.status_code == 200:\n",
"            with open(os.path.join(lesson_images_dir, image_filename), 'wb') as file:\n",
"                file.write(response_lesson_image.content)\n",
"            print(f'Fichier enregistré: {image_filename}')\n",
"        else:\n",
"            print(f'Échec du téléchargement: {lesson_image}')"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"outputs": [],
|
||
|
"source": [],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.6"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|