russ_react/.import/mysql_json_convert.py

108 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import io
from datetime import datetime
from bs4 import BeautifulSoup
# Dump file settings.
# Directory that holds the source JSON dump; the converted file is written here too.
input_directory = "D:/__TEMP/____RUSS_CULT/__convert"
# Name of the input dump without its ".json" extension (main() appends it).
json_file = "1-2025-p"
# Name of the converted output file, created inside input_directory.
output_file = "1-2025-p-convert.json"
def _extract_excerpt_and_cover(pre, full):
    """Derive the excerpt text and cover image URL from a record's HTML.

    The preview HTML (``pre``) is searched for the first ``<img>``; its
    ``src`` becomes the cover URL and the tag is removed.  The excerpt is
    the text of all remaining non-empty ``<p>`` tags joined by spaces.  If
    that yields nothing, the text of the first ``<p>`` in ``full`` is used.

    Args:
        pre: Preview HTML of the record (``None`` is treated as empty).
        full: Full HTML of the record (``None`` is treated as empty).

    Returns:
        A ``(excerpt, cover_url)`` tuple; ``cover_url`` is ``None`` when the
        preview has no image or the image has no ``src`` attribute.
    """
    # A JSON null would make BeautifulSoup raise, so fall back to empty markup.
    soup = BeautifulSoup(pre or "", "html.parser")

    # Extract the image URL.
    img_tag = soup.find("img")
    # .get() instead of ["src"]: an <img> without src must not raise KeyError,
    # which the caller would misreport as a missing key of the record.
    img_url = img_tag.get("src") if img_tag else None

    # Remove the <img> tag from the HTML.
    if img_tag:
        img_tag.decompose()

    # Drop <p> tags that are empty or contain only whitespace so they do not
    # contribute stray separators to the joined excerpt.
    for p in soup.find_all("p"):
        if not p.get_text(strip=True):
            p.decompose()

    # Collect the text of the remaining <p> tags.
    text_content = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
    if not text_content:
        # Preview had no usable text: take the first <p> of the full body.
        first_p = BeautifulSoup(full or "", "html.parser").find("p")
        text_content = first_p.get_text(strip=True) if first_p else ""

    return text_content, img_url


def _build_article(item):
    """Convert one dump record into the article dict written to the output.

    Args:
        item: A record from the input JSON; the keys ``pre``, ``full``,
            ``id``, ``title``, ``cat_id``, ``city_id`` and ``date`` are read.

    Returns:
        The article dict, with keys in the order they appear in the output.

    Raises:
        KeyError: If the record lacks one of the keys listed above.
        ValueError: If ``date`` is not a ``DD.MM.YYYY`` string.
    """
    pre = item['pre']
    full = item['full']

    article = {
        'id': item['id'],
        'title': item['title'],
        'content': full,
        'categoryId': item['cat_id'],
        'cityId': item['city_id'],
    }

    excerpt, cover_url = _extract_excerpt_and_cover(pre, full)
    article['excerpt'] = excerpt
    article['coverImage'] = cover_url
    article['readTime'] = 2
    article['likes'] = 0
    article['dislikes'] = 0
    article['gallery'] = []

    # Parse the DD.MM.YYYY string and re-emit it as a midnight timestamp.
    raw_date = item['date']
    try:
        date_obj = datetime.strptime(raw_date, "%d.%m.%Y")
    except (TypeError, ValueError) as e:
        # Normalise to ValueError so the caller can skip just this record.
        raise ValueError(f"некорректная дата {raw_date!r}") from e
    article['publishedAt'] = date_obj.strftime("%Y-%m-%dT00:00:00Z")

    article['author'] = {'id': '41e09d9a-f9c1-44a7-97f4-0be694371e7e'}
    return article


def main():
    """Read the JSON dump, convert every record and write the result.

    The input is ``input_directory/json_file + '.json'``; the converted list
    is written to ``input_directory/output_file`` via ``save_to_json_file``.
    Records with a missing key or an unparsable date are reported on stdout
    and skipped; a missing or malformed input file aborts with a message.
    """
    try:
        # The context manager closes the handle even if decoding fails.
        with io.open(input_directory + '/' + json_file + '.json', encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print("Ошибка: Входной JSON файл не найден.")
        return
    except json.JSONDecodeError as e:
        print(f"Ошибка декодирования JSON: {e}")
        return

    articles = []
    for item in records:
        try:
            articles.append(_build_article(item))
        except KeyError as e:
            print(f"Потерян ключ в записи: {e}")
        except ValueError as e:
            # One bad date must not abort the whole conversion.
            print(f"Ошибка в данных записи: {e}")

    save_to_json_file(output_file, articles, 'w')
def save_to_file(path, data, mode):
    """Write ``data`` to the file ``path`` inside ``input_directory``.

    Args:
        path: File name relative to ``input_directory``.
        data: Content to write; ``str`` for text modes, ``bytes`` for
            binary modes.
        mode: Mode string passed to ``open`` (for example ``'w'`` or ``'wb'``).
    """
    # Text is written as UTF-8, matching the rest of this script, instead of
    # the platform default; binary modes do not accept an encoding argument.
    encoding = None if 'b' in mode else 'utf-8'
    # The context manager closes the file even when write() raises.
    with open(input_directory + '/' + path, mode, encoding=encoding) as f:
        f.write(data)
def save_to_json_file(path, data, mode):
    """Serialize ``data`` as indented UTF-8 JSON into ``input_directory``.

    Args:
        path: File name relative to ``input_directory``.
        data: JSON-serializable object to dump.
        mode: Text mode string passed to ``io.open`` (for example ``'w'``).
    """
    # The context manager closes the file even if json.dump() fails midway,
    # e.g. on a value that is not JSON-serializable.
    with io.open(input_directory + '/' + path, encoding='utf-8', mode=mode) as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        json.dump(data, f, ensure_ascii=False, indent=2)
#def create_article:
# Run the conversion only when executed as a script, not when imported.
if "__main__" == __name__:
    main()