hotels/scraper_missing.py
Фёдор 0cf3297290 Hotel audit project: core scripts and documentation
- Crawlers: smart_crawler.py, regional_crawler.py
- Audit: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- RKN registry checks: check_rkn_registry.py, recheck_unclear_rkn.py
- Reports: create_orel_horizontal_report.py
- Processing: process_all_hotels_embeddings.py
- Documentation: README.md, DB_SCHEMA_REFERENCE.md
2025-10-16 10:52:09 +03:00

#!/usr/bin/env python3
"""
Scraper for the REMAINING unprocessed hotels,
with automatic reconnection to the database.
"""
import requests
import psycopg2
from psycopg2.extras import execute_batch, Json
import time
import logging
from datetime import datetime
from urllib.parse import unquote
from typing import Optional, Dict, List
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'scraper_missing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # The password is stored URL-encoded; unquote() decodes it at import time.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}
API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
RATE_LIMIT_DELAY = 0.1  # seconds to sleep before each API request
BATCH_SIZE = 50         # hotels per database write
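
# With four API calls per hotel (see get_hotel_details below) and
# RATE_LIMIT_DELAY = 0.1 s, the throughput ceiling is roughly
# 1 / (4 * 0.1) = 2.5 hotels/sec before network latency is counted.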

class MissingScraper:
    def __init__(self, limit=None, offset=0):
        self.limit = limit
        self.offset = offset
        self.conn = None
        self.session = requests.Session()
        self.processed = 0
        self.errors = 0

    def reconnect_db(self):
        """Reconnect to the database, discarding any stale connection."""
        if self.conn:
            try:
                self.conn.close()
            except Exception:
                pass
        self.conn = psycopg2.connect(**DB_CONFIG)

    def get_missing_hotel_ids(self):
        """Return the IDs of hotels that have no raw JSON saved yet."""
        self.reconnect_db()
        cur = self.conn.cursor()
        # Anti-join: hotels present in hotel_main but absent from hotel_raw_json
        sql = """
            SELECT m.id
            FROM hotel_main m
            LEFT JOIN hotel_raw_json r ON m.id = r.hotel_id
            WHERE r.hotel_id IS NULL
            ORDER BY m.id
        """
        if self.limit:
            # limit/offset come from argv and were already validated as ints
            sql += f" LIMIT {self.limit} OFFSET {self.offset}"
        cur.execute(sql)
        ids = [row[0] for row in cur.fetchall()]
        cur.close()
        return ids
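
    # Progress can also be checked by hand with the same anti-join
    # (a sketch, assuming only the two tables used above):
    #   SELECT COUNT(*) FROM hotel_main m
    #   LEFT JOIN hotel_raw_json r ON m.id = r.hotel_id
    #   WHERE r.hotel_id IS NULL;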

    def api_request(self, url: str) -> Optional[Dict]:
        """Rate-limited GET; returns parsed JSON, or None on any failure."""
        time.sleep(RATE_LIMIT_DELAY)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError):
            return None
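
    # Illustrative sketch (hypothetical helper, not wired into run()): the
    # same request with simple retries and exponential backoff, instead of
    # silently giving up after the first failure as api_request does.
    def api_request_with_retries(self, url: str, retries: int = 3) -> Optional[Dict]:
        """Sketch: retry transient failures with exponential backoff."""
        for attempt in range(retries):
            time.sleep(RATE_LIMIT_DELAY * (2 ** attempt))
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.json()
            except (requests.RequestException, ValueError) as e:
                logger.warning(f"Attempt {attempt + 1}/{retries} failed for {url}: {e}")
        return None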

    def get_hotel_details(self, hotel_id: str) -> Dict:
        """Fetch all four detail payloads for one hotel."""
        return {
            'hotel_id': hotel_id,
            'main': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main"),
            'additional_info': self.api_request(f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info"),
            'sanatorium': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer"),
            'drawer': self.api_request(f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer")
        }

    def save_batch(self, batch: List[Dict]):
        """Persist one batch, reconnecting to the database first."""
        if not batch:
            return
        # Reconnect before every save so a dropped connection cannot
        # poison the rest of the run
        self.reconnect_db()
        cur = self.conn.cursor()
        try:
            # Upsert into hotel_raw_json
            records = [(item['hotel_id'], Json(item['main']), Json(item['additional_info']),
                        Json(item['sanatorium']), Json(item['drawer'])) for item in batch]
            sql = """
                INSERT INTO hotel_raw_json
                    (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (hotel_id) DO UPDATE SET
                    main_data = EXCLUDED.main_data,
                    additional_info = EXCLUDED.additional_info,
                    sanatorium_data = EXCLUDED.sanatorium_data,
                    drawer_data = EXCLUDED.drawer_data
            """
            execute_batch(cur, sql, records, page_size=BATCH_SIZE)
            self.conn.commit()
            logger.info(f"✓ Saved {len(batch)} hotels")
        except Exception as e:
            logger.error(f"Save failed: {e}")
            self.conn.rollback()
            self.errors += len(batch)
        finally:
            cur.close()
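
    # The ON CONFLICT clause in save_batch assumes hotel_raw_json has a
    # unique constraint on hotel_id. A minimal sketch of the assumed table
    # (the actual schema lives in DB_SCHEMA_REFERENCE.md):
    #   CREATE TABLE IF NOT EXISTS hotel_raw_json (
    #       hotel_id        TEXT PRIMARY KEY,
    #       main_data       JSONB,
    #       additional_info JSONB,
    #       sanatorium_data JSONB,
    #       drawer_data     JSONB
    #   );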

    def run(self):
        """Entry point: scrape every missing hotel and log progress."""
        start = datetime.now()
        logger.info("🚀 Starting scrape of UNPROCESSED hotels")
        # Fetch the list of unprocessed hotel IDs
        hotel_ids = self.get_missing_hotel_ids()
        total = len(hotel_ids)
        logger.info(f"📊 Unprocessed hotels: {total}")
        if total == 0:
            logger.info("All hotels have already been processed!")
            return
        batch = []
        for idx, hotel_id in enumerate(hotel_ids, 1):
            try:
                details = self.get_hotel_details(hotel_id)
                batch.append(details)
                self.processed += 1
                # Flush a full batch
                if len(batch) >= BATCH_SIZE:
                    self.save_batch(batch)
                    batch = []
                # Progress report every 100 hotels
                if idx % 100 == 0:
                    elapsed = (datetime.now() - start).total_seconds()
                    speed = self.processed / elapsed
                    eta_min = (total - idx) / speed / 60
                    logger.info(f"Progress: {idx}/{total} ({idx/total*100:.1f}%) | "
                                f"Speed: {speed:.1f}/sec | ETA: {eta_min:.0f} min")
            except Exception as e:
                logger.error(f"Error processing {hotel_id}: {e}")
                self.errors += 1
        # Flush the remainder
        if batch:
            self.save_batch(batch)
        elapsed = (datetime.now() - start).total_seconds()
        logger.info(f"\n{'='*70}")
        logger.info(f"Finished: {self.processed}/{total}")
        logger.info(f"Errors: {self.errors}")
        logger.info(f"Time: {elapsed/60:.1f} min")
        logger.info(f"{'='*70}")

if __name__ == "__main__":
    import sys

    limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
    offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    scraper = MissingScraper(limit=limit, offset=offset)
    scraper.run()
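
# Usage (sketch, based on the argv handling above):
#   python scraper_missing.py              # process every missing hotel
#   python scraper_missing.py 1000         # first 1000 missing hotels
#   python scraper_missing.py 1000 2000    # 1000 hotels starting at offset 2000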