🚀 Add feature: WordPress conversion
This commit is contained in:
parent
f3dec3b49a
commit
5ee50a0f0f
7 changed files with 67064 additions and 4 deletions
|
@ -1,5 +1,6 @@
|
|||
import logging
|
||||
import os
|
||||
|
||||
import dotenv
|
||||
from minio import Minio
|
||||
|
||||
|
@ -72,7 +73,7 @@ available_sources = AvailableSourcesResponse(
|
|||
display_name="Markdown", name="markdown", format="markdown"
|
||||
),
|
||||
AvailableSource(
|
||||
display_name="Wordpress", name="wordpress_xml", format="xml"
|
||||
display_name="Wordpress", name="wordpress", format="xml"
|
||||
),
|
||||
AvailableSource(display_name="Ebook", name="ebook_pdf", format="pdf"),
|
||||
AvailableSource(
|
||||
|
|
66
backend/app/convert/convert_wordpress_xml.py
Normal file
66
backend/app/convert/convert_wordpress_xml.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import datetime
|
||||
import re
|
||||
from typing import Dict, Union, List
|
||||
|
||||
import markdownify
|
||||
import pandas as pd
|
||||
import xmltodict
|
||||
|
||||
from app.config import logger
|
||||
from app.convert.base_converter import BaseConverter
|
||||
|
||||
|
||||
class WordPressXmlConverter(BaseConverter):
    """Turn a WordPress XML export into rows of source/type/date/url/content.

    The steps below operate on ``self.df``, a DataFrame built from the
    ``<item>`` elements found under ``rss/channel``. The order in which the
    steps run is decided by ``BaseConverter`` (not shown in this module).
    """

    def __init__(self, content: Union[str, bytes]):
        """Store the raw export and initialise the parsed-state holders."""
        super().__init__(content)
        self.wordpress_dict = None  # parsed XML, filled in by read_file()
        self.df = None  # working DataFrame, filled in by read_file()

    def read_file(self) -> None:
        """Parse the XML content and load every channel item into ``self.df``."""
        self.wordpress_dict = xmltodict.parse(self.content)
        items = self.wordpress_dict['rss']['channel']['item']
        # xmltodict returns a single mapping instead of a list when the
        # export holds exactly one <item>; wrap it so each item is one row.
        if not isinstance(items, list):
            items = [items]
        self.df = pd.DataFrame(items)

    def add_metadata(self) -> None:
        """Add the constant ``source`` column and a ``type`` label per row."""
        self.df['source'] = 'WordPress'
        # Post types other than 'post' / 'page' have no label and map to NaN.
        self.df['type'] = self.df['wp:post_type'].map(
            {'post': 'Article WordPress', 'page': 'Page WordPress'})

    def convert_columns(self) -> None:
        """Keep published posts/pages and derive ``date`` and ``content``."""
        is_post_or_page = self.df['wp:post_type'].isin(['page', 'post'])
        is_published = self.df['wp:status'] == 'publish'
        # .copy() so the column assignments below act on an independent frame.
        self.df = self.df[is_post_or_page & is_published].copy()

        # Re-emit the post date in ISO 8601 form.
        self.df['date'] = self.df['wp:post_date'].apply(
            lambda x: datetime.datetime.fromisoformat(x).isoformat()
        )

        self.df['content'] = self.df['content:encoded'].apply(
            self.wp_to_markdown)

    def rename_columns(self) -> None:
        """Rename the ``link`` column to ``url``."""
        self.df.rename(columns={"link": "url"}, inplace=True)

    def clean_data(self) -> None:
        """Reduce the frame to the output columns and blank out missing values."""
        self.df = self.df[['source', 'type', 'date', 'url', 'content']].copy()
        self.df['path'] = ""  # Add empty 'path' column
        self.df.fillna(value="", inplace=True)

    @staticmethod
    def wp_to_markdown(x):
        """Convert one post body from HTML to single-line Markdown.

        Args:
            x: The HTML body of a post, or ``None`` when the post is empty.

        Returns:
            The Markdown text with every run of newlines collapsed to one
            space and surrounding whitespace stripped, or ``""`` when the
            body is ``None`` or the conversion fails.
        """
        # An empty body is not a conversion failure; skip it without logging.
        if x is None:
            return ""
        try:
            markdown = markdownify.markdownify(x, heading_style='ATX')
            md_text = re.sub(r'\n+', ' ', markdown).strip()
        except Exception as e:
            # Best effort: one unconvertible post must not abort the export.
            logger.error(
                f"Error converting WordPress content to Markdown: {e}")
            md_text = ""
        return md_text
|
||||
|
||||
|
||||
def convert_wordpress_xml(content: Union[str, bytes]) -> List[Dict]:
    """Convert the content of a WordPress XML export.

    Args:
        content: The raw XML export, as text or bytes.

    Returns:
        The value produced by ``WordPressXmlConverter.convert()``.
    """
    logger.info("Starting conversion of WordPress XML content")
    converter = WordPressXmlConverter(content)
    # Run the conversion exactly once and hand back that single result.
    return converter.convert()
|
|
@ -1,7 +1,7 @@
|
|||
import datetime
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Union, List
|
||||
from typing import Dict, Union, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
|
@ -37,6 +37,7 @@ from app.convert.convert_linkedin_comments_csv import (
|
|||
)
|
||||
from app.convert.convert_linkedin_shares_csv import convert_linkedin_shares_csv
|
||||
from app.convert.convert_markdown_txt import convert_markdown_txt
|
||||
from app.convert.convert_wordpress_xml import convert_wordpress_xml
|
||||
from app.convert.convert_youtube_shorts_video import (
|
||||
convert_youtube_shorts_video,
|
||||
)
|
||||
|
@ -90,6 +91,9 @@ CONVERTERS = {
|
|||
},
|
||||
"export": {
|
||||
"txt": convert_export_txt
|
||||
},
|
||||
"wordpress": {
|
||||
"xml": convert_wordpress_xml
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -4,4 +4,6 @@ pydantic
|
|||
pytest
|
||||
pandas
|
||||
minio
|
||||
python-dotenv
|
||||
python-dotenv
|
||||
xmltodict
|
||||
markdownify
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue