🚀 Add feature: WordPress conversion
This commit is contained in:
parent
f3dec3b49a
commit
5ee50a0f0f
7 changed files with 67064 additions and 4 deletions
66
backend/app/convert/convert_wordpress_xml.py
Normal file
66
backend/app/convert/convert_wordpress_xml.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import datetime
|
||||
import re
|
||||
from typing import Dict, Union, List
|
||||
|
||||
import markdownify
|
||||
import pandas as pd
|
||||
import xmltodict
|
||||
|
||||
from app.config import logger
|
||||
from app.convert.base_converter import BaseConverter
|
||||
|
||||
|
||||
class WordPressXmlConverter(BaseConverter):
    """Convert a WordPress XML (WXR) export into a normalized DataFrame.

    The export is parsed with ``xmltodict`` and the ``rss > channel > item``
    entries are loaded into ``self.df``. The remaining methods add metadata,
    keep only published posts/pages, convert the HTML body to Markdown and
    reduce the frame to ``source``, ``type``, ``date``, ``url``, ``content``
    and ``path``.

    NOTE(review): the order in which these steps run is decided by
    ``BaseConverter.convert()``, which is not visible here.
    """

    def __init__(self, content: Union[str, bytes]):
        """Store the raw export; parsing is deferred to ``read_file``."""
        super().__init__(content)
        # Parsed XML tree as nested mappings (set by read_file).
        self.wordpress_dict = None
        # One row per <item> of the export (set by read_file).
        self.df = None

    def read_file(self) -> None:
        """Parse ``self.content`` and load the channel items into ``self.df``.

        Raises:
            KeyError: if the document has no ``rss > channel > item`` path.
        """
        self.wordpress_dict = xmltodict.parse(self.content)
        items = self.wordpress_dict['rss']['channel']['item']
        # xmltodict yields a single mapping (not a list) when the export
        # contains exactly one <item>; wrap it so the DataFrame always gets
        # one row per item instead of mis-reading the mapping as columns.
        if isinstance(items, dict):
            items = [items]
        self.df = pd.DataFrame(items)

    def add_metadata(self) -> None:
        """Add the constant ``source`` column and a human-readable ``type``.

        Post types other than ``post`` and ``page`` map to NaN.
        """
        self.df['source'] = 'WordPress'
        self.df['type'] = self.df['wp:post_type'].map(
            {'post': 'Article WordPress', 'page': 'Page WordPress'})

    def convert_columns(self) -> None:
        """Keep published posts/pages and derive ``date`` and ``content``."""
        is_post_or_page = self.df['wp:post_type'].isin(['page', 'post'])
        is_published = self.df['wp:status'] == 'publish'
        # .copy() so the column assignments below do not write into a view
        # of the unfiltered frame.
        self.df = self.df[is_post_or_page & is_published].copy()

        # Re-emit the post date in ISO 8601 form; isoformat() already
        # returns a str, so no extra conversion is needed.
        self.df['date'] = self.df['wp:post_date'].apply(
            lambda x: datetime.datetime.fromisoformat(x).isoformat()
        )

        self.df['content'] = self.df['content:encoded'].apply(
            self.wp_to_markdown)

    def rename_columns(self) -> None:
        """Rename the WordPress ``link`` column to ``url``."""
        self.df.rename(columns={"link": "url"}, inplace=True)

    def clean_data(self) -> None:
        """Reduce the frame to the output columns and blank out missing values."""
        self.df = self.df[['source', 'type', 'date', 'url', 'content']].copy()
        self.df['path'] = ""  # Add empty 'path' column
        self.df.fillna(value="", inplace=True)

    @staticmethod
    def wp_to_markdown(x):
        """Convert one HTML body to single-line Markdown.

        Args:
            x: HTML content of a post. Non-string values (e.g. ``None`` or
                NaN for an empty/missing ``content:encoded``) are treated as
                empty content.

        Returns:
            The Markdown text with every run of newlines collapsed to one
            space and surrounding whitespace stripped, or ``""`` when there
            is nothing to convert or the conversion fails.
        """
        # Missing content is expected for some items; return early rather
        # than letting markdownify fail and log a spurious error per row.
        if not isinstance(x, str):
            return ""
        try:
            markdown = markdownify.markdownify(x, heading_style='ATX')
            md_text = re.sub(r'\n+', ' ', markdown).strip()
        except Exception as e:
            # Deliberate best-effort: one malformed post must not abort the
            # whole export, so log it and fall back to empty content.
            logger.error(
                f"Error converting WordPress content to Markdown: {str(e)}")
            md_text = ""
        return md_text
|
||||
|
||||
|
||||
def convert_wordpress_xml(content: Union[str, bytes]) -> List[Dict]:
    """Convert a WordPress XML export using ``WordPressXmlConverter``.

    Args:
        content: Raw WordPress XML export, as text or bytes.

    Returns:
        Whatever ``WordPressXmlConverter.convert()`` returns (annotated as a
        list of dicts; the method itself is inherited and not visible here).
    """
    logger.info("Starting conversion of WordPress XML content")
    converter = WordPressXmlConverter(content)
    # Run the pipeline exactly once: a second convert() call would re-parse
    # and re-process the whole export only to discard the first result.
    result = converter.convert()

    return result
|
Loading…
Add table
Add a link
Reference in a new issue