🚀 Add feature: WordPress conversion

This commit is contained in:
François Pelletier 2025-05-19 18:46:59 -04:00
parent f3dec3b49a
commit 5ee50a0f0f
7 changed files with 67064 additions and 4 deletions

View file

@@ -0,0 +1,66 @@
import datetime
import re
from typing import Dict, Union, List
import markdownify
import pandas as pd
import xmltodict
from app.config import logger
from app.convert.base_converter import BaseConverter
class WordPressXmlConverter(BaseConverter):
    """Convert a WordPress XML export into a normalized DataFrame.

    The XML is parsed with ``xmltodict`` and the ``rss/channel/item``
    entries are loaded into ``self.df``.  The other methods filter, enrich
    and reshape that frame.  The order in which the steps are invoked is
    decided by ``BaseConverter`` (not visible in this file).
    """

    def __init__(self, content: Union[str, bytes]):
        """Store the raw export and initialise the working attributes.

        Args:
            content: Raw WordPress XML export, as text or bytes.
        """
        super().__init__(content)
        # Parsed XML tree, filled in by read_file().
        self.wordpress_dict = None
        # Working DataFrame with one row per <item>, filled in by read_file().
        self.df = None

    def read_file(self) -> None:
        """Parse ``self.content`` and load the channel items into ``self.df``."""
        self.wordpress_dict = xmltodict.parse(self.content)
        items = self.wordpress_dict['rss']['channel']['item']
        # xmltodict yields a bare mapping instead of a list when the channel
        # contains exactly one <item>; wrap it so every item is one row.
        if not isinstance(items, list):
            items = [items]
        self.df = pd.DataFrame(items)

    def add_metadata(self) -> None:
        """Add the constant ``source`` column and a ``type`` label per row.

        ``type`` is derived from ``wp:post_type``; values other than
        ``post`` and ``page`` have no mapping and become NaN.
        """
        self.df['source'] = 'WordPress'
        self.df['type'] = self.df['wp:post_type'].map(
            {'post': 'Article WordPress', 'page': 'Page WordPress'})

    def convert_columns(self) -> None:
        """Keep published posts/pages and derive ``date`` and ``content``."""
        is_post_or_page = self.df['wp:post_type'].isin(['page', 'post'])
        is_published = self.df['wp:status'] == 'publish'
        # .copy() so the column assignments below act on an independent
        # frame rather than on a slice of the unfiltered one.
        self.df = self.df[is_post_or_page & is_published].copy()
        # Normalise the post date to an ISO 8601 string.
        self.df['date'] = self.df['wp:post_date'].apply(
            lambda x: datetime.datetime.fromisoformat(x).isoformat()
        )
        self.df['content'] = self.df['content:encoded'].apply(
            self.wp_to_markdown)

    def rename_columns(self) -> None:
        """Rename the ``link`` column to ``url``."""
        self.df.rename(columns={"link": "url"}, inplace=True)

    def clean_data(self) -> None:
        """Reduce the frame to the output columns and blank out missing values."""
        self.df = self.df[['source', 'type', 'date', 'url', 'content']].copy()
        self.df['path'] = ""  # Add empty 'path' column
        self.df.fillna(value="", inplace=True)

    @staticmethod
    def wp_to_markdown(x):
        """Convert one item's HTML body to single-line Markdown.

        Args:
            x: HTML content of the item, or ``None`` when the item has no
                content.

        Returns:
            The Markdown text with every run of newlines collapsed to one
            space and surrounding whitespace stripped.  An empty string is
            returned for ``None`` input or when the conversion fails.
        """
        # An empty <content:encoded/> element is parsed as None; that is a
        # normal case, not a conversion error, so skip it without logging.
        if x is None:
            return ""
        try:
            markdown = markdownify.markdownify(x, heading_style='ATX')
            md_text = re.sub(r'\n+', ' ', markdown).strip()
        except Exception as e:
            # Best effort: one bad item must not abort the whole export.
            logger.error(
                f"Error converting WordPress content to Markdown: {str(e)}")
            md_text = ""
        return md_text
def convert_wordpress_xml(content: Union[str, bytes]) -> List[Dict]:
    """Convert a WordPress XML export using ``WordPressXmlConverter``.

    Args:
        content: Raw WordPress XML export, as text or bytes.

    Returns:
        The value produced by ``WordPressXmlConverter.convert()``
        (annotated as a list of dicts).
    """
    logger.info("Starting conversion of WordPress XML content")
    converter = WordPressXmlConverter(content)
    # Run the conversion exactly once and return that result; calling
    # convert() a second time would repeat the whole pipeline for nothing.
    return converter.convert()