#!/usr/bin/env python3
"""
Collect detailed information for every hotel.

For each hotel ID stored in ``hotel_main`` the script queries four API
endpoints (main, additional-info, sanatoriumDrawer, drawer) and writes the
responses to PostgreSQL.

Usage: script.py [limit] [offset]
"""

import json
import logging
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import unquote

import psycopg2
import requests
from psycopg2.extras import Json, execute_batch

# Logging setup: one timestamped log file per run plus console output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'scraper_detailed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database connection parameters.
# Each value can be overridden through a HOTEL_DB_* environment variable; the
# literals below are only the historical defaults.
# NOTE(review): credentials are hard-coded in source — they should be rotated
# and supplied exclusively through the environment.
DB_CONFIG = {
    'host': os.environ.get('HOTEL_DB_HOST', "147.45.189.234"),
    'port': int(os.environ.get('HOTEL_DB_PORT', 5432)),
    'database': os.environ.get('HOTEL_DB_NAME', "default_db"),
    'user': os.environ.get('HOTEL_DB_USER', "gen_user"),
    'password': os.environ.get('HOTEL_DB_PASSWORD') or unquote("2~~9_%5EkVsU%3F2%5CS")
}

API_BASE_URL = "https://tourism.fsa.gov.ru/api/v1"
RATE_LIMIT_DELAY = 0.1  # pause before each request: at most ~10 requests per second
BATCH_SIZE = 100  # hotels accumulated before a database write
CHECKPOINT_INTERVAL = 1000  # log progress every N processed hotels


class DetailedScraper:
    """Fetch per-hotel details from the API and persist them to PostgreSQL."""

    def __init__(self):
        self.conn = None
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; HotelDataCollector/1.0)'
        })
        self.processed_count = 0
        self.error_count = 0
        self.start_time = None

    def connect_db(self):
        """Open the database connection described by ``DB_CONFIG``."""
        self.conn = psycopg2.connect(**DB_CONFIG)
        logger.info("✓ Подключено к базе данных")

    def api_request(self, url: str) -> Optional[Dict]:
        """Perform a rate-limited GET request and return the decoded JSON.

        Returns ``None`` when the request fails, the server answers with an
        error status, or the body is not valid JSON. Failures are logged at
        DEBUG level only.
        """
        time.sleep(RATE_LIMIT_DELAY)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on older requests versions.
            logger.debug(f"API request failed: {url} - {e}")
            return None

    def get_hotel_ids(self, limit=None, offset=0):
        """Return hotel IDs from ``hotel_main`` ordered by ID.

        Args:
            limit: maximum number of IDs to return; a falsy value means "all".
            offset: number of leading IDs to skip. It is applied even when
                ``limit`` is not given.
        """
        sql = "SELECT id FROM hotel_main ORDER BY id"
        params = []
        if limit:
            sql += " LIMIT %s"
            params.append(limit)
        if offset:
            sql += " OFFSET %s"
            params.append(offset)

        with self.conn.cursor() as cur:
            cur.execute(sql, params)
            return [row[0] for row in cur.fetchall()]

    def get_detailed_info(self, hotel_id: str) -> Dict:
        """Query the four detail endpoints for one hotel.

        Returns a dict with keys ``hotel_id``, ``main``, ``additional_info``,
        ``sanatorium`` and ``drawer``; each endpoint value is the decoded JSON
        or ``None`` if that request failed.
        """
        endpoints = (
            ('main', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/main"),
            ('additional_info', f"{API_BASE_URL}/resorts/common/{hotel_id}/additional-info"),
            ('sanatorium', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/sanatoriumDrawer"),
            ('drawer', f"{API_BASE_URL}/resorts/hotels/{hotel_id}/drawer"),  # services and rooms
        )
        result = {'hotel_id': hotel_id}
        for key, url in endpoints:
            result[key] = self.api_request(url)
        return result

    def _write_rows(self, sql: str, rows: List[tuple], page_size: int = BATCH_SIZE):
        """Execute ``sql`` once per row and commit; do nothing for an empty list."""
        if not rows:
            return
        # The context manager closes the cursor even if execute_batch raises.
        with self.conn.cursor() as cur:
            execute_batch(cur, sql, rows, page_size=page_size)
        self.conn.commit()

    def save_main_updates(self, data_list: List[Dict]):
        """Update contact columns of ``hotel_main`` from the ``main`` payloads."""
        if not data_list:
            return

        updates = []
        for item in data_list:
            main = item.get('main')
            # Skip hotels whose request failed or returned a non-object body.
            if not main or not isinstance(main, dict):
                continue
            updates.append((
                main.get('shortName'),
                main.get('phone'),
                main.get('email'),
                main.get('websiteAddress'),
                main.get('ownerFullName'),
                item['hotel_id']
            ))

        sql = """
            UPDATE hotel_main SET
                short_name = %s,
                phone = %s,
                email = %s,
                website_address = %s,
                owner_full_name = %s,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = %s
        """
        self._write_rows(sql, updates)

    def save_additional_info(self, data_list: List[Dict]):
        """Upsert the ``additional_info`` payloads into ``hotel_additional_info``."""
        if not data_list:
            return

        records = []
        for item in data_list:
            info = item.get('additional_info')
            if not info or not isinstance(info, dict):
                continue
            records.append((
                item['hotel_id'],
                info.get('ownerOgrn'),
                info.get('ownerInn'),
                info.get('ownerKpp'),
                info.get('ownerShortName'),
                info.get('ownerPhone'),
                info.get('ownerEmail'),
                info.get('resortFullName'),
                info.get('ownerAddressName'),
                info.get('ownerLegalTypeId'),
                info.get('phone'),
                info.get('email')
            ))

        sql = """
            INSERT INTO hotel_additional_info
                (hotel_id, owner_ogrn, owner_inn, owner_kpp, owner_short_name,
                 owner_phone, owner_email, resort_full_name, owner_address_name,
                 owner_legal_type_id, phone, email)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                owner_ogrn = EXCLUDED.owner_ogrn,
                owner_inn = EXCLUDED.owner_inn,
                owner_kpp = EXCLUDED.owner_kpp,
                owner_short_name = EXCLUDED.owner_short_name,
                owner_phone = EXCLUDED.owner_phone,
                owner_email = EXCLUDED.owner_email,
                resort_full_name = EXCLUDED.resort_full_name,
                owner_address_name = EXCLUDED.owner_address_name,
                owner_legal_type_id = EXCLUDED.owner_legal_type_id,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email
        """
        self._write_rows(sql, records)

    def save_sanatorium_info(self, data_list: List[Dict]):
        """Upsert sanatorium payloads into ``hotel_sanatorium``.

        Only items whose ``sanatorium`` payload is a dict with a non-empty
        ``sanatoriumInfo`` object are stored.
        """
        if not data_list:
            return

        records = []
        for item in data_list:
            san = item.get('sanatorium')
            if not isinstance(san, dict):
                continue
            info = san.get('sanatoriumInfo')
            if not info or not isinstance(info, dict):
                continue
            records.append((
                item['hotel_id'],
                info.get('oid'),
                info.get('fullName'),
                info.get('shortName'),
                info.get('ogrn'),
                info.get('inn'),
                info.get('legalAddress'),
                info.get('actualAddress'),
                info.get('phone'),
                info.get('email'),
                info.get('webSite'),
                info.get('medicalLicense'),
                info.get('farmLicense'),
                info.get('terrenkur'),
                info.get('resortName'),
                info.get('hasWaterSupply'),
                info.get('hasHeating'),
                info.get('hasSewage'),
                info.get('hasAirConditioning'),
                info.get('hasElevator'),
                info.get('hasTelephone'),
                info.get('hasInternet'),
                info.get('hasMobilityLift'),
                info.get('hasGym'),
                info.get('hasConferenceRoom'),
                # Nested sections are taken from the top-level payload and
                # stored as JSON values.
                Json(san.get('swimmingPoolInfo')),
                Json(san.get('plageInfo')),
                Json(san.get('landDocumentInfo')),
                Json(san.get('roomsInfo'))
            ))

        sql = """
            INSERT INTO hotel_sanatorium
                (hotel_id, oid, full_name, short_name, ogrn, inn,
                 legal_address, actual_address, phone, email, web_site,
                 medical_license, farm_license, terrenkur, resort_name,
                 has_water_supply, has_heating, has_sewage, has_air_conditioning,
                 has_elevator, has_telephone, has_internet, has_mobility_lift,
                 has_gym, has_conference_room,
                 swimming_pool_info, plage_info, land_document_info, rooms_info)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                oid = EXCLUDED.oid,
                full_name = EXCLUDED.full_name,
                short_name = EXCLUDED.short_name,
                ogrn = EXCLUDED.ogrn,
                inn = EXCLUDED.inn,
                legal_address = EXCLUDED.legal_address,
                actual_address = EXCLUDED.actual_address,
                phone = EXCLUDED.phone,
                email = EXCLUDED.email,
                web_site = EXCLUDED.web_site,
                medical_license = EXCLUDED.medical_license,
                farm_license = EXCLUDED.farm_license,
                terrenkur = EXCLUDED.terrenkur,
                resort_name = EXCLUDED.resort_name,
                has_water_supply = EXCLUDED.has_water_supply,
                has_heating = EXCLUDED.has_heating,
                has_sewage = EXCLUDED.has_sewage,
                has_air_conditioning = EXCLUDED.has_air_conditioning,
                has_elevator = EXCLUDED.has_elevator,
                has_telephone = EXCLUDED.has_telephone,
                has_internet = EXCLUDED.has_internet,
                has_mobility_lift = EXCLUDED.has_mobility_lift,
                has_gym = EXCLUDED.has_gym,
                has_conference_room = EXCLUDED.has_conference_room,
                swimming_pool_info = EXCLUDED.swimming_pool_info,
                plage_info = EXCLUDED.plage_info,
                land_document_info = EXCLUDED.land_document_info,
                rooms_info = EXCLUDED.rooms_info
        """
        self._write_rows(sql, records)

    def save_services_and_rooms(self, data_list: List[Dict]):
        """Store services and room categories found in the ``drawer`` payloads.

        Services go to ``hotel_services`` (conflicting rows are skipped) and
        rooms go to ``hotel_rooms``. Everything is committed once at the end.
        """
        if not data_list:
            return

        services_sql = """
            INSERT INTO hotel_services
                (hotel_id, service_category_id, service_category_name,
                 service_id, service_name)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id, service_id) DO NOTHING
        """
        # NOTE(review): no ON CONFLICT clause here, so re-running the scraper
        # for the same hotel appears to append duplicate room rows unless the
        # table enforces its own constraint — confirm against the schema.
        rooms_sql = """
            INSERT INTO hotel_rooms
                (hotel_id, room_category_id, room_category_name,
                 apartment_count, number_seats, equipment_list,
                 family_room_count, disability_room_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """

        with self.conn.cursor() as cur:
            for item in data_list:
                drawer = item.get('drawer')
                if not drawer or not isinstance(drawer, dict):
                    continue

                hotel_id = item['hotel_id']

                # Services: flatten category -> service into one row each.
                # "or []" / "or {}" guard against keys present with a null value.
                services = []
                for service_group in drawer.get('hotelServiceInfoList') or []:
                    cat_id = service_group.get('id')
                    cat_name = service_group.get('name')
                    for service in service_group.get('servicesList') or []:
                        services.append((
                            hotel_id,
                            cat_id,
                            cat_name,
                            service.get('id'),
                            service.get('name')
                        ))
                if services:
                    execute_batch(cur, services_sql, services, page_size=200)

                # Rooms
                rooms = []
                for room in drawer.get('roomInfoList') or []:
                    category = room.get('roomCategory') or {}
                    rooms.append((
                        hotel_id,
                        category.get('id'),
                        category.get('name'),
                        room.get('apartmentCount'),
                        room.get('numberSeats'),
                        Json(room.get('equipmentList', [])),
                        room.get('familyRoomCount'),
                        room.get('disabilityRoomCount')
                    ))
                if rooms:
                    execute_batch(cur, rooms_sql, rooms, page_size=100)

        self.conn.commit()

    def save_raw_json(self, data_list: List[Dict]):
        """Upsert the raw endpoint payloads into ``hotel_raw_json`` as a backup."""
        if not data_list:
            return

        records = [
            (
                item['hotel_id'],
                Json(item.get('main')),
                Json(item.get('additional_info')),
                Json(item.get('sanatorium')),
                Json(item.get('drawer'))
            )
            for item in data_list
        ]

        sql = """
            INSERT INTO hotel_raw_json
                (hotel_id, main_data, additional_info, sanatorium_data, drawer_data)
            VALUES (%s, %s, %s, %s, %s)
            ON CONFLICT (hotel_id) DO UPDATE SET
                main_data = EXCLUDED.main_data,
                additional_info = EXCLUDED.additional_info,
                sanatorium_data = EXCLUDED.sanatorium_data,
                drawer_data = EXCLUDED.drawer_data
        """
        self._write_rows(sql, records)

    def _log_progress(self, total: int):
        """Log throughput and the estimated remaining time."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        # Guard against a zero elapsed time so the statistics never raise.
        rate = self.processed_count / elapsed if elapsed > 0 else 0.0
        remaining = (total - self.processed_count) / rate / 60 if rate > 0 else 0.0
        logger.info(
            f"Progress: {self.processed_count}/{total} ({self.processed_count/total*100:.1f}%) | "
            f"Speed: {rate:.1f} hotels/sec | "
            f"ETA: {remaining:.1f} min | "
            f"Errors: {self.error_count}"
        )

    def run(self, limit=None, offset=0):
        """Run the collection for the selected hotels.

        Args:
            limit: maximum number of hotels to process; a falsy value means all.
            offset: number of hotels (ordered by ID) to skip.

        Hotels are fetched one by one and written in batches of ``BATCH_SIZE``.
        On ``KeyboardInterrupt`` the pending batch is saved before exiting.
        The connection is closed and a summary is logged in every case.
        """
        self.start_time = datetime.now()
        self.connect_db()

        total = 0
        batch = []

        try:
            # Fetch the list of hotel IDs to process.
            hotel_ids = self.get_hotel_ids(limit=limit, offset=offset)
            total = len(hotel_ids)

            logger.info("=" * 70)
            logger.info("Запуск сбора детальной информации")
            logger.info(f"Отелей к обработке: {total}")
            logger.info(f"Начало: {self.start_time}")
            logger.info("=" * 70)

            for hotel_id in hotel_ids:
                try:
                    details = self.get_detailed_info(hotel_id)
                    batch.append(details)
                    self.processed_count += 1

                    # Flush a full batch to the database.
                    if len(batch) >= BATCH_SIZE:
                        self.save_batch(batch)
                        batch = []

                    # Periodic progress report.
                    if self.processed_count % CHECKPOINT_INTERVAL == 0:
                        self._log_progress(total)

                except Exception as e:
                    # One failing hotel must not stop the whole run.
                    logger.error(f"Error processing hotel {hotel_id}: {e}")
                    self.error_count += 1

            # Save whatever is left after the loop.
            if batch:
                self.save_batch(batch)

        except KeyboardInterrupt:
            logger.info("\n⚠ Прервано пользователем")
            if batch:
                self.save_batch(batch)

        finally:
            if self.conn:
                self.conn.close()

            elapsed = (datetime.now() - self.start_time).total_seconds()
            speed = self.processed_count / elapsed if elapsed > 0 else 0.0
            logger.info("=" * 70)
            logger.info("Сбор детальной информации завершен")
            logger.info(f"Обработано: {self.processed_count}/{total} отелей")
            logger.info(f"Ошибок: {self.error_count}")
            logger.info(f"Время: {elapsed/60:.1f} минут")
            logger.info(f"Скорость: {speed:.1f} отелей/сек")
            logger.info("=" * 70)

    def save_batch(self, batch):
        """Write one batch through every ``save_*`` step.

        Any exception is logged, the whole batch is counted as failed, and the
        open transaction is rolled back so the connection stays usable for
        later batches. Steps that already committed remain stored.
        """
        logger.debug(f"Сохраняю батч из {len(batch)} отелей...")

        try:
            self.save_main_updates(batch)
            self.save_additional_info(batch)
            self.save_sanatorium_info(batch)
            self.save_services_and_rooms(batch)
            self.save_raw_json(batch)
        except Exception as e:
            logger.error(f"Ошибка сохранения батча: {e}")
            self.error_count += len(batch)
            # Without a rollback PostgreSQL rejects every following statement
            # on this connection until the failed transaction is ended.
            try:
                self.conn.rollback()
            except psycopg2.Error as rollback_error:
                logger.error(f"Rollback failed: {rollback_error}")


def main():
    """Parse optional ``limit`` and ``offset`` arguments and run the scraper."""
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else None
    offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    logger.info(f"Параметры: limit={limit or 'all'}, offset={offset}")

    scraper = DetailedScraper()
    scraper.run(limit=limit, offset=offset)


if __name__ == "__main__":
    main()