"""Convert a dumped JSON list of records into the article import format.

Reads ``<input_directory>/<json_file>.json``, maps every record to an article
dictionary and writes the resulting list to ``<input_directory>/<output_file>``.
"""

import io
import json
from datetime import datetime

from bs4 import BeautifulSoup

# Dump file location and names.
input_directory = "D:/__TEMP/____4/__convert"
json_file = "1-2025"
output_file = "2-2025-convert.json"

# Fixed values written into every converted article.
DEFAULT_AUTHOR_ID = '41e09d9a-f9c1-44a7-97f4-0be694371e7e'
DEFAULT_READ_TIME = 2


def main():
    """Load the input dump, convert each record and save the articles.

    Prints a message and returns early when the input file is missing or is
    not valid JSON. A record that lacks a required key or carries an invalid
    date is reported and skipped; the remaining records are still converted.
    """
    try:
        # The context manager closes the handle even if decoding fails.
        with io.open(_in_directory(json_file + '.json'), encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print("Ошибка: Входной JSON файл не найден.")
        return
    except json.JSONDecodeError as e:
        print(f"Ошибка декодирования JSON: {e}")
        return

    articles = []
    for item in records:
        try:
            articles.append(_create_article(item))
        except KeyError as e:
            print(f"Потерян ключ в записи: {e}")
        except ValueError as e:
            # One bad date must not abort the conversion of the whole dump.
            print(f"Некорректные данные в записи: {e}")

    save_to_json_file(output_file, articles, 'w')


def _create_article(item):
    """Build one article dictionary from a single dump record.

    Args:
        item: Mapping with the keys ``pre``, ``full``, ``id``, ``title``,
            ``cat_id``, ``city_id`` and ``date``.

    Returns:
        The article dictionary, with keys in the order they are written out.

    Raises:
        KeyError: A required key is missing from ``item``.
        ValueError: ``item['date']`` is not a ``DD.MM.YYYY`` string.
    """
    pre = item['pre']
    full = item['full']
    cover_image, excerpt = _extract_cover_and_excerpt(pre, full)
    return {
        'id': item['id'],
        'title': item['title'],
        'content': full,
        'categoryId': item['cat_id'],
        'cityId': item['city_id'],
        'excerpt': excerpt,
        'coverImage': cover_image,
        'readTime': DEFAULT_READ_TIME,
        'likes': 0,
        'dislikes': 0,
        'gallery': [],
        'publishedAt': _format_published_at(item['date']),
        'author': {'id': DEFAULT_AUTHOR_ID},
    }


def _extract_cover_and_excerpt(pre, full):
    """Return ``(cover_image_url, excerpt_text)`` for one record.

    The cover image is the ``src`` of the first ``<img>`` in ``pre`` (``None``
    when there is no image or it has no ``src``). The excerpt is the text of
    the non-empty ``<p>`` tags in ``pre``; when that is empty, the text of the
    first ``<p>`` in ``full`` is used instead.
    """
    # A JSON null is treated as empty markup.
    soup = BeautifulSoup(pre or "", "html.parser")

    # Extract the image URL. .get() yields None for an <img> without a src
    # attribute, so such a record is kept rather than reported as a lost key.
    img_tag = soup.find("img")
    cover_image = img_tag.get("src") if img_tag is not None else None

    # Remove the <img> tag from the HTML.
    if img_tag is not None:
        img_tag.decompose()

    # Remove empty <p> tags (whitespace-only or completely empty).
    for p in soup.find_all("p"):
        # strip=True drops surrounding whitespace and invisible characters.
        if not p.get_text(strip=True):
            p.decompose()

    # Extract the text of the remaining <p> tags.
    excerpt = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
    if not excerpt:
        # Fall back to the text of the first <p> tag of the full content.
        first_p = BeautifulSoup(full or "", "html.parser").find("p")
        excerpt = first_p.get_text(strip=True) if first_p is not None else ""

    return cover_image, excerpt


def _format_published_at(raw_date):
    """Convert a ``DD.MM.YYYY`` string to ``YYYY-MM-DDT00:00:00Z``.

    Raises:
        ValueError: ``raw_date`` is not a string or does not match the format.
    """
    if not isinstance(raw_date, str):
        # strptime would raise TypeError here; report it like any other bad date.
        raise ValueError(f"date must be a DD.MM.YYYY string, got {raw_date!r}")
    return datetime.strptime(raw_date, "%d.%m.%Y").strftime("%Y-%m-%dT00:00:00Z")


def _in_directory(name):
    """Return the path of ``name`` inside ``input_directory``."""
    return input_directory + '/' + name


def save_to_file(path, data, mode):
    """Write ``data`` to the file ``path`` inside ``input_directory``.

    Args:
        path: File name relative to ``input_directory``.
        data: Content passed to ``write``; ``str`` for text modes, bytes for
            binary modes.
        mode: Mode string passed to ``open`` (for example ``'w'`` or ``'wb'``).
    """
    # Text modes use UTF-8, matching save_to_json_file; binary modes must not
    # receive an encoding argument.
    encoding = None if 'b' in mode else 'utf-8'
    with io.open(_in_directory(path), mode, encoding=encoding) as f:
        f.write(data)


def save_to_json_file(path, data, mode):
    """Serialize ``data`` as indented UTF-8 JSON into ``input_directory``.

    Args:
        path: File name relative to ``input_directory``.
        data: JSON-serializable object.
        mode: Text mode string passed to ``open`` (for example ``'w'``).
    """
    with io.open(_in_directory(path), encoding='utf-8', mode=mode) as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(data, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()