Files
hotels/retry_failed_hotels.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

117 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Re-crawl hotels whose crawl status is 'failed'.
Softer settings: HTTP fallback, ignore SSL errors.

NOTE(review): this script itself only collects the failed-hotel list and
writes it to a TSV file; the actual crawling is delegated to
smart_crawler.py / single_hotel_crawler.py (see the log hints in main()).
"""
import logging
import os
import sys
from collections import Counter
from datetime import datetime
from urllib.parse import unquote

import psycopg2
from psycopg2.extras import RealDictCursor
# Настройка логирования
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f'retry_failed_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# Database connection settings.
# NOTE(security): credentials used to be hard-coded here (and are now exposed
# in version control — rotate them). They are read from the environment with
# the original values as backward-compatible fallbacks; set HOTELS_DB_* vars
# in deployment instead of relying on the defaults.
DB_CONFIG = {
    'host': os.environ.get('HOTELS_DB_HOST', '147.45.189.234'),
    'port': int(os.environ.get('HOTELS_DB_PORT', '5432')),
    'database': os.environ.get('HOTELS_DB_NAME', 'default_db'),
    'user': os.environ.get('HOTELS_DB_USER', 'gen_user'),
    # unquote() decodes the percent-encoded password (%5E -> ^, %3F -> ?, %5C -> \)
    'password': unquote(os.environ.get('HOTELS_DB_PASSWORD', '2~~9_%5EkVsU%3F2%5CS')),
}
def get_failed_hotels(region_name=None):
    """Fetch hotels whose crawl status is 'failed'.

    Args:
        region_name: Optional region filter (matched against h.region_name);
            all regions when None.

    Returns:
        list of RealDictCursor rows with keys: id, full_name,
        website_address, error_message.
    """
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
    try:
        # Cursor context manager closes the cursor; finally closes the
        # connection even if the query raises (the original leaked both).
        with conn.cursor() as cur:
            query = """
                SELECT h.id, h.full_name, h.website_address, hwm.error_message
                FROM hotel_main h
                INNER JOIN hotel_website_meta hwm ON h.id = hwm.hotel_id
                WHERE hwm.crawl_status = 'failed'
            """
            if region_name:
                query += " AND h.region_name = %s"
                cur.execute(query, (region_name,))
            else:
                cur.execute(query)
            return cur.fetchall()
    finally:
        conn.close()
def main():
    """CLI entry point: report failed hotels and dump them to a TSV file.

    Usage: retry_failed_hotels.py [region_name]

    Side effects: logs a summary (error statistics, first 10 hotels) and
    writes failed_hotels_<region|all>_<timestamp>.txt with one
    id/name/website row per hotel.
    """
    region = sys.argv[1] if len(sys.argv) > 1 else None
    logger.info("=" * 70)
    logger.info("🔄 ПЕРЕКРАУЛИНГ FAILED ОТЕЛЕЙ")
    if region:
        logger.info(f"📍 Регион: {region}")
    else:
        logger.info("📍 Регион: ВСЕ")
    logger.info("=" * 70)

    # Fetch the failed hotels (optionally filtered by region).
    hotels = get_failed_hotels(region)
    logger.info(f"\n📊 Найдено {len(hotels)} failed отелей")
    if not hotels:
        logger.info("✅ Нет failed отелей!")
        return

    # Aggregate error counts by the message prefix before the first ':'
    # (split(':')[0] returns the whole message when there is no colon).
    errors = Counter(
        (hotel['error_message'] or 'Unknown').split(':')[0]
        for hotel in hotels
    )
    logger.info("\n📊 СТАТИСТИКА ОШИБОК:")
    for error_type, count in errors.most_common():
        logger.info(f" {error_type}: {count}")

    # Persist the list as TSV so a crawler can consume it.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"failed_hotels_{region or 'all'}_{timestamp}.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        for hotel in hotels:
            f.write(f"{hotel['id']}\t{hotel['full_name']}\t{hotel['website_address']}\n")
    # BUG FIX: the original logged a literal placeholder instead of the
    # actual output path.
    logger.info(f"\n💾 Список сохранён в: {filename}")
    logger.info(f"\n📋 ЗАПУСК КРАУЛЕРА:")
    logger.info(f" Можно запустить smart_crawler.py с этим списком")
    logger.info(f" Или использовать single_hotel_crawler.py для каждого отеля")

    # Preview the first 10 hotels for a quick sanity check.
    logger.info(f"\n📋 ПЕРВЫЕ 10 ОТЕЛЕЙ:")
    for i, hotel in enumerate(hotels[:10], 1):
        logger.info(f" {i}. {hotel['full_name']}")
        logger.info(f" Сайт: {hotel['website_address']}")
        logger.info(f" ID: {hotel['id']}")


if __name__ == "__main__":
    main()