108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
import json
|
||
import io
|
||
from datetime import datetime
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Dump file location: the directory that holds the input dump and receives
# the converted output.
input_directory = "D:/__TEMP/____4/__convert"
# Input file name without extension; main() appends ".json" when opening it.
json_file = "1-2025"
# Output file name; it is written inside input_directory.
output_file = "2-2025-convert.json"
|
||
|
||
def main():
    """Convert the dumped records into the article JSON format.

    Reads ``<input_directory>/<json_file>.json``, converts every record with
    ``_build_article`` and hands the resulting list to ``save_to_json_file``
    under the name ``output_file``.

    A missing or undecodable input file is reported and ends the run without
    writing anything.  A record with a missing key or an unparsable date is
    reported and skipped; the remaining records are still converted.
    """
    source_path = input_directory + '/' + json_file + '.json'
    try:
        # The context manager closes the handle even when decoding fails.
        with io.open(source_path, encoding='utf-8') as f:
            records = json.load(f)
    except FileNotFoundError:
        print("Ошибка: Входной JSON файл не найден.")
        return
    except json.JSONDecodeError as e:
        print(f"Ошибка декодирования JSON: {e}")
        return

    articles = []
    for item in records:
        try:
            articles.append(_build_article(item))
        except KeyError as e:
            print(f"Потерян ключ в записи: {e}")
        except ValueError as e:
            # datetime.strptime raises ValueError for a date that is not in
            # DD.MM.YYYY form; skip that record instead of aborting the run.
            print(f"Некорректная дата в записи: {e}")

    save_to_json_file(output_file, articles, 'w')


def _build_article(item):
    """Map one dump record to the output article dict.

    Raises:
        KeyError: a required key is missing from *item*.
        ValueError: ``item['date']`` does not match ``%d.%m.%Y``.
    """
    pre = item['pre']
    full = item['full']

    # Insertion order is kept deliberately: it is the key order of the
    # emitted JSON objects.
    article = {
        'id': item['id'],
        'title': item['title'],
        'content': full,
        'categoryId': item['cat_id'],
        'cityId': item['city_id'],
    }

    excerpt, cover_url = _extract_preview(pre, full)
    article['excerpt'] = excerpt
    article['coverImage'] = cover_url

    # Fixed placeholder values assigned to every converted article.
    article['readTime'] = 2
    article['likes'] = 0
    article['dislikes'] = 0
    article['gallery'] = []

    # The dump stores a day-only date; emit it as midnight with a "Z" suffix.
    published = datetime.strptime(item['date'], "%d.%m.%Y")
    article['publishedAt'] = published.strftime("%Y-%m-%dT00:00:00Z")

    # Every article is attributed to the same hard-coded author id.
    article['author'] = {'id': '41e09d9a-f9c1-44a7-97f4-0be694371e7e'}
    return article


def _extract_preview(pre, full):
    """Return ``(excerpt, cover_url)`` derived from the preview HTML *pre*.

    The cover URL is the ``src`` of the first ``<img>`` in *pre*, or ``None``
    when there is no image or the image has no ``src``.  The excerpt is the
    space-joined text of the non-empty ``<p>`` elements of *pre*; when that is
    empty, the text of the first ``<p>`` of *full* is used (``""`` if none).
    """
    soup = BeautifulSoup(pre, "html.parser")

    img_tag = soup.find("img")
    cover_url = None
    if img_tag is not None:
        # .get() returns None for an <img> without src; indexing would raise
        # KeyError and drop the whole record as if an input key were missing.
        cover_url = img_tag.get("src")
        # Remove the image from the markup.
        img_tag.decompose()

    # Drop paragraphs that are empty or contain only whitespace.
    for p in soup.find_all("p"):
        if not p.get_text(strip=True):
            p.decompose()

    excerpt = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
    if not excerpt:
        # Preview had no usable text: fall back to the first paragraph of the
        # full article body.
        first_p = BeautifulSoup(full, "html.parser").find("p")
        excerpt = first_p.get_text(strip=True) if first_p else ""

    return excerpt, cover_url
|
||
|
||
|
||
def save_to_file(path, data, mode):
    """Write *data* to ``<input_directory>/<path>``.

    Args:
        path: File name relative to ``input_directory``.
        data: Content to write; ``str`` for text modes, bytes-like for binary
            modes.
        mode: Mode string passed to ``open`` (e.g. ``'w'``, ``'a'``, ``'wb'``).

    Text modes are written as UTF-8, matching the other file I/O in this
    script, rather than the platform default encoding.
    """
    # open() rejects an encoding argument in binary mode, so pass None there.
    encoding = None if 'b' in mode else 'utf-8'
    # The context manager closes the file even if write() raises.
    with open(input_directory + '/' + path, mode, encoding=encoding) as f:
        f.write(data)
|
||
|
||
def save_to_json_file(path, data, mode):
    """Serialize *data* as JSON into ``<input_directory>/<path>``.

    Args:
        path: File name relative to ``input_directory``.
        data: JSON-serializable object.
        mode: Text mode string passed to ``io.open`` (e.g. ``'w'``).

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is instead of ``\\uXXXX`` escapes.
    """
    # The context manager flushes and closes the file even if json.dump()
    # raises part-way through serialization.
    with io.open(input_directory + '/' + path, encoding='utf-8', mode=mode) as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
|
||
|
||
|
||
#def create_article:
|
||
|
||
|
||
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
|