import json
import io
from datetime import datetime
from bs4 import BeautifulSoup
# Dump file locations.
# Directory that holds the input dump; the converted output is written here too.
input_directory = "D:/__TEMP/____4/__convert"
# Base name of the input dump; main() appends ".json" when opening it.
json_file = "1-2025"
# File name of the converted output, created inside input_directory.
output_file = "2-2025-convert.json"
# Author id attached to every converted article.
_AUTHOR_ID = '41e09d9a-f9c1-44a7-97f4-0be694371e7e'
# Date layout used by the input records (day.month.year).
_SOURCE_DATE_FORMAT = "%d.%m.%Y"
# Layout of the emitted ``publishedAt`` value; the time part is fixed to midnight.
_PUBLISHED_AT_FORMAT = "%Y-%m-%dT00:00:00Z"


def main():
    """Convert the input JSON dump into article records and save them.

    Reads ``<input_directory>/<json_file>.json``, builds one article per
    record and writes the resulting list to ``<input_directory>/<output_file>``.
    A missing or undecodable input file is reported and aborts the run.
    A record with a missing key or an invalid value (for example a date that
    does not match ``dd.mm.YYYY``) is reported and skipped.
    """
    source_path = input_directory + '/' + json_file + '.json'
    try:
        # The context manager closes the handle even when decoding fails.
        with open(source_path, encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print("Ошибка: Входной JSON файл не найден.")
        return
    except json.JSONDecodeError as e:
        print(f"Ошибка декодирования JSON: {e}")
        return

    articles = []
    for item in records:
        try:
            articles.append(_build_article(item))
        except KeyError as e:
            print(f"Потерян ключ в записи: {e}")
        except ValueError as e:
            # A malformed date must not abort the whole conversion and lose
            # the records that were already converted.
            print(f"Некорректное значение в записи: {e}")

    save_to_json_file(output_file, articles, 'w')


def _build_article(item):
    """Build one output article dict from a single input record.

    Args:
        item: Input record; the keys ``pre``, ``full``, ``id``, ``title``,
            ``cat_id``, ``city_id`` and ``date`` are read.

    Returns:
        The article dict in the output layout.

    Raises:
        KeyError: A required key is missing from ``item`` (or the preview's
            ``<img>`` tag has no ``src`` attribute).
        ValueError: ``item['date']`` does not match ``dd.mm.YYYY``.
    """
    pre = item['pre']
    full = item['full']
    article = {
        'id': item['id'],
        'title': item['title'],
        'content': full,
        'categoryId': item['cat_id'],
        'cityId': item['city_id'],
    }

    excerpt, cover_url = _extract_excerpt_and_cover(pre, full)
    article['excerpt'] = excerpt
    article['coverImage'] = cover_url
    article['readTime'] = 2
    article['likes'] = 0
    article['dislikes'] = 0
    article['gallery'] = []

    # Parse the source date and re-emit it in the output timestamp layout.
    published = datetime.strptime(item['date'], _SOURCE_DATE_FORMAT)
    article['publishedAt'] = published.strftime(_PUBLISHED_AT_FORMAT)
    article['author'] = {'id': _AUTHOR_ID}
    return article


def _extract_excerpt_and_cover(pre, full):
    """Return ``(excerpt, cover_url)`` derived from a record's HTML fields.

    Args:
        pre: HTML of the preview; source of the cover image and the excerpt.
        full: HTML of the full text; its first ``<p>`` is the fallback
            excerpt when the preview yields no text.

    Returns:
        A tuple of the excerpt text and the cover image URL (``None`` when
        the preview contains no ``<img>`` tag).
    """
    soup = BeautifulSoup(pre, "html.parser")

    # Take the image URL, then remove the <img> tag from the preview HTML.
    # An <img> without a src attribute raises KeyError here, so the caller
    # skips the whole record.
    img_tag = soup.find("img")
    cover_url = img_tag["src"] if img_tag else None
    if img_tag:
        img_tag.decompose()

    # Drop <p> tags that are empty or contain only whitespace;
    # strip=True trims the text so whitespace-only paragraphs count as empty.
    for p in soup.find_all("p"):
        if not p.get_text(strip=True):
            p.decompose()

    # Join the text of the remaining <p> tags into the excerpt.
    excerpt = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
    if not excerpt:
        # Fall back to the text of the first <p> of the full article.
        first_p = BeautifulSoup(full, "html.parser").find("p")
        excerpt = first_p.get_text(strip=True) if first_p else ""
    return excerpt, cover_url


def save_to_file(path, data, mode):
    """Write ``data`` to ``<input_directory>/<path>`` opened with ``mode``.

    No encoding is passed, so text modes use the platform default encoding.
    """
    with open(input_directory + '/' + path, mode) as f:
        f.write(data)


def save_to_json_file(path, data, mode):
    """Serialize ``data`` as indented UTF-8 JSON to ``<input_directory>/<path>``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    with open(input_directory + '/' + path, encoding='utf-8', mode=mode) as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()