🚀 Add feature: WordPress conversion

This commit is contained in:
François Pelletier 2025-05-19 18:46:59 -04:00
parent f3dec3b49a
commit 5ee50a0f0f
7 changed files with 67064 additions and 4 deletions

View file

@ -1,5 +1,6 @@
import logging
import os
import dotenv
from minio import Minio
@ -72,7 +73,7 @@ available_sources = AvailableSourcesResponse(
display_name="Markdown", name="markdown", format="markdown"
),
AvailableSource(
display_name="Wordpress", name="wordpress_xml", format="xml"
display_name="Wordpress", name="wordpress", format="xml"
),
AvailableSource(display_name="Ebook", name="ebook_pdf", format="pdf"),
AvailableSource(

View file

@ -0,0 +1,66 @@
import datetime
import re
from typing import Dict, Union, List
import markdownify
import pandas as pd
import xmltodict
from app.config import logger
from app.convert.base_converter import BaseConverter
class WordPressXmlConverter(BaseConverter):
    """Turn a WordPress XML export into a table of published posts and pages.

    The XML is parsed with ``xmltodict`` and the ``rss > channel > item``
    entries are loaded into a pandas DataFrame (``self.df``). The methods
    below each perform one step on that frame; they are presumably driven
    by ``BaseConverter.convert()`` (not visible in this file -- confirm the
    call order there).
    """

    # Raw item columns that the steps of this class read. Used to build an
    # empty-but-well-formed frame when the export contains no items.
    _ITEM_COLUMNS = (
        'wp:post_type',
        'wp:status',
        'wp:post_date',
        'content:encoded',
        'link',
    )

    def __init__(self, content: Union[str, bytes]):
        """Store the raw XML *content*; parsing happens in ``read_file``."""
        super().__init__(content)
        self.wordpress_dict = None  # parsed XML as nested dicts
        self.df = None  # one row per <item> once read_file() has run

    def read_file(self) -> None:
        """Parse the XML and load every ``<item>`` into ``self.df``.

        Raises:
            KeyError: if the document has no ``rss`` / ``channel`` element.
        """
        self.wordpress_dict = xmltodict.parse(self.content)
        items = self.wordpress_dict['rss']['channel'].get('item')

        # xmltodict yields a list only for repeated elements: a single
        # <item> comes back as one dict and a missing <item> as nothing.
        # Normalise both to a list so the DataFrame always has one row
        # per item.
        if items is None:
            items = []
        elif not isinstance(items, list):
            items = [items]

        if items:
            self.df = pd.DataFrame(items)
        else:
            # Keep the columns the later steps index so they run on an
            # empty frame instead of raising KeyError.
            self.df = pd.DataFrame(columns=list(self._ITEM_COLUMNS))

    def add_metadata(self) -> None:
        """Add the constant ``source`` column and a ``type`` label column.

        ``type`` is derived from ``wp:post_type``; only ``post`` and
        ``page`` have a label in the mapping.
        """
        self.df['source'] = 'WordPress'
        self.df['type'] = self.df['wp:post_type'].map(
            {'post': 'Article WordPress', 'page': 'Page WordPress'})

    def convert_columns(self) -> None:
        """Keep published posts/pages and derive ``date`` and ``content``.

        ``date`` is ``wp:post_date`` re-serialised through
        ``datetime.fromisoformat(...).isoformat()``; ``content`` is
        ``content:encoded`` converted by ``wp_to_markdown``.
        """
        is_published_entry = (
            self.df['wp:post_type'].isin(['page', 'post'])
            & (self.df['wp:status'] == 'publish')
        )
        self.df = self.df[is_published_entry].copy()

        self.df['date'] = self.df['wp:post_date'].apply(
            lambda x: datetime.datetime.fromisoformat(x).isoformat()
        )
        self.df['content'] = self.df['content:encoded'].apply(
            self.wp_to_markdown)

    def rename_columns(self) -> None:
        """Rename the item ``link`` column to ``url``."""
        self.df.rename(columns={"link": "url"}, inplace=True)

    def clean_data(self) -> None:
        """Reduce the frame to the output columns and blank out missing values."""
        self.df = self.df[['source', 'type', 'date', 'url', 'content']].copy()
        self.df['path'] = ""  # Add empty 'path' column
        self.df.fillna(value="", inplace=True)

    @staticmethod
    def wp_to_markdown(x):
        """Convert one item's HTML body to single-line Markdown.

        Headings use ATX style (``#``). Every run of newlines in the
        Markdown output is replaced by one space and the result is
        stripped. Returns ``""`` when *x* is ``None`` or when the
        conversion fails (the failure is logged, not raised).
        """
        if x is None:
            # An item with an empty body parses to None; that is not an
            # error, so skip the converter (and the error log) entirely.
            return ""
        try:
            md_text = re.sub(
                r'\n+', ' ',
                markdownify.markdownify(x, heading_style='ATX'),
            ).strip()
        except Exception as e:
            # Deliberate best-effort: one bad item must not abort the
            # whole export, so log and fall back to empty content.
            logger.error(
                f"Error converting WordPress content to Markdown: {e}")
            md_text = ""
        return md_text
def convert_wordpress_xml(content: Union[str, bytes]) -> List[Dict]:
    """Convert a WordPress XML export with ``WordPressXmlConverter``.

    Args:
        content: Raw XML of the export, as text or bytes.

    Returns:
        Whatever ``WordPressXmlConverter.convert()`` returns (annotated as
        a list of dicts; the method is inherited and not visible here).
    """
    logger.info("Starting conversion of WordPress XML content")
    converter = WordPressXmlConverter(content)
    # Run the pipeline exactly once: the original called convert() twice
    # and discarded the first result, doubling the parse/convert work.
    return converter.convert()

View file

@ -1,7 +1,7 @@
import datetime
import json
import os
from typing import Any, Dict, Union, List
from typing import Dict, Union, List
from fastapi import APIRouter, HTTPException
@ -37,6 +37,7 @@ from app.convert.convert_linkedin_comments_csv import (
)
from app.convert.convert_linkedin_shares_csv import convert_linkedin_shares_csv
from app.convert.convert_markdown_txt import convert_markdown_txt
from app.convert.convert_wordpress_xml import convert_wordpress_xml
from app.convert.convert_youtube_shorts_video import (
convert_youtube_shorts_video,
)
@ -90,6 +91,9 @@ CONVERTERS = {
},
"export": {
"txt": convert_export_txt
},
"wordpress": {
"xml": convert_wordpress_xml
}
}

View file

@ -4,4 +4,6 @@ pydantic
pytest
pandas
minio
python-dotenv
python-dotenv
xmltodict
markdownify

View file

@ -18,4 +18,14 @@ Content-Type: application/json
"source_type": "linkedin_shares",
"source_format": "csv",
"filename": "linkedin_shares.csv"
}
### Convert WordPress Posts
POST {{baseUrl}}/convert
Content-Type: application/json
{
"source_type": "wordpress",
"source_format": "xml",
"filename": "wordpress.xml"
}

File diff suppressed because one or more lines are too long

View file

@ -4,4 +4,7 @@ pandas~=2.2.3
pydantic~=2.11.4
python-dotenv~=1.1.0
requests~=2.32.3
streamlit~=1.45.0
streamlit~=1.45.0
xmltodict~=0.14.2
markdownify~=1.1.0
minio~=7.2.15