Files
hotels/audit_spb_retry.py
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

269 lines
9.2 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Повторная обработка неудачных отелей СПб через n8n webhook
"""
import psycopg2
from psycopg2.extras import RealDictCursor
import requests
import json
import time
from urllib.parse import unquote
# Конфигурация
DB_CONFIG = {
'host': '147.45.189.234',
'port': 5432,
'database': 'default_db',
'user': 'gen_user',
'password': unquote('2~~9_%5EkVsU%3F2%5CS')
}
N8N_WEBHOOK_URL = "https://n8n.clientright.pro/webhook/6be4a7b9-a016-4252-841f-0ebca367914f"
REGION = 'г. Санкт-Петербург'
AUDIT_VERSION = 'v1.0_with_rkn'
def get_failed_hotels():
"""Получить список отелей, которые упали в ошибку"""
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
cur = conn.cursor()
# Все отели с чанками
cur.execute("""
SELECT DISTINCT metadata->>'hotel_id' as hotel_id
FROM hotel_website_chunks
WHERE metadata->>'hotel_id' IN (
SELECT id::text FROM hotel_main WHERE region_name = %s
)
""", (REGION,))
all_hotels = {row['hotel_id'] for row in cur.fetchall()}
# Уже обработанные
cur.execute("""
SELECT hotel_id::text
FROM hotel_audit_results
WHERE region_name = %s AND audit_version = %s
""", (REGION, AUDIT_VERSION))
processed_hotels = {row['hotel_id'] for row in cur.fetchall()}
# Неудачники
failed_hotel_ids = list(all_hotels - processed_hotels)
# Получаем полную информацию о неудачниках
cur.execute("""
SELECT
hm.id,
hm.full_name as hotel_name,
hm.website_address,
hm.region_name,
hm.rkn_registry_number as registry_number
FROM hotel_main hm
WHERE hm.id::text = ANY(%s)
ORDER BY hm.full_name
""", (failed_hotel_ids,))
hotels = cur.fetchall()
cur.close()
conn.close()
return hotels
def get_hotel_chunks(hotel_id):
"""Получить чанки для отеля"""
conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
cur = conn.cursor()
cur.execute("""
SELECT text as chunk_text, metadata
FROM hotel_website_chunks
WHERE metadata->>'hotel_id' = %s
ORDER BY (metadata->>'page_number')::int, (metadata->>'chunk_index')::int
""", (str(hotel_id),))
chunks = cur.fetchall()
cur.close()
conn.close()
return chunks
def audit_hotel_via_webhook(hotel, chunks):
"""Отправить отель на аудит через n8n webhook"""
payload = {
"hotel_id": str(hotel['id']),
"hotel_name": hotel['hotel_name'],
"website": hotel['website_address'] or "",
"region_name": hotel['region_name'],
"registry_number": hotel['registry_number'] or "",
"chunks": [
{
"text": chunk['chunk_text'],
"metadata": chunk['metadata']
}
for chunk in chunks
]
}
try:
response = requests.post(
N8N_WEBHOOK_URL,
json=payload,
timeout=180 # 3 минуты таймаут (для больших отелей)
)
if response.status_code == 200:
try:
result = response.json()
# n8n может вернуть массив или объект
if isinstance(result, list) and len(result) > 0:
result = result[0]
# Проверяем наличие нужных полей
if not isinstance(result, dict):
return False, f"Response is not a dict: {type(result)}, content: {str(result)[:200]}"
# Преобразуем структуру n8n в структуру для БД
if 'found' in result and 'total_criteria' in result:
# Новый формат от n8n
result['total_score'] = result.get('found', 0)
result['max_score'] = result.get('total_criteria', 17)
result['score_percentage'] = result.get('compliance_percentage', 0.0)
# Преобразуем criteria в criteria_results
if 'criteria' in result:
result['criteria_results'] = result['criteria']
if 'total_score' not in result:
return False, f"Missing required fields in response: {json.dumps(result, ensure_ascii=False)[:200]}"
return True, result
except json.JSONDecodeError:
return False, f"Invalid JSON response: {response.text[:100]}"
else:
return False, f"HTTP {response.status_code}: {response.text[:100]}"
except requests.exceptions.Timeout:
return False, "Timeout after 180 seconds"
except Exception as e:
return False, str(e)
def save_audit_result(hotel, audit_result):
"""Сохранить результат аудита"""
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()
try:
cur.execute("""
INSERT INTO hotel_audit_results (
hotel_id,
hotel_name,
website,
region_name,
total_score,
max_score,
score_percentage,
criteria_results,
audit_version
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (hotel_id, audit_version)
DO UPDATE SET
total_score = EXCLUDED.total_score,
max_score = EXCLUDED.max_score,
score_percentage = EXCLUDED.score_percentage,
criteria_results = EXCLUDED.criteria_results
""", (
hotel['id'],
hotel['hotel_name'],
hotel['website_address'],
hotel['region_name'],
audit_result['total_score'],
audit_result['max_score'],
audit_result['score_percentage'],
json.dumps(audit_result['criteria_results'], ensure_ascii=False),
AUDIT_VERSION
))
conn.commit()
return True
except Exception as e:
conn.rollback()
print(f" ❌ Ошибка сохранения: {e}")
return False
finally:
cur.close()
conn.close()
def main():
print("🔄 ПОВТОРНАЯ ОБРАБОТКА НЕУДАЧНЫХ ОТЕЛЕЙ СПб")
print("=" * 60)
# Получаем неудачников
failed_hotels = get_failed_hotels()
total = len(failed_hotels)
print(f"📊 Найдено неудачных отелей: {total}")
print(f"🚀 Начинаем обработку через n8n webhook...")
print()
success_count = 0
error_count = 0
start_time = time.time()
for idx, hotel in enumerate(failed_hotels, 1):
print(f"[{idx}/{total}] {hotel['hotel_name']}")
print(f" 🔗 {hotel['website_address'] or 'Нет сайта'}")
# Получаем чанки
chunks = get_hotel_chunks(hotel['id'])
print(f" 📦 Chunks: {len(chunks)}")
if not chunks:
print(f" ⚠️ Нет чанков, пропускаем")
error_count += 1
continue
# Отправляем на аудит
print(f" 🔍 Аудит: {hotel['hotel_name']}...")
success, result = audit_hotel_via_webhook(hotel, chunks)
if success:
# Сохраняем результат
if save_audit_result(hotel, result):
score = result['score_percentage']
print(f" ✅ Успех! Балл: {score:.1f}%")
success_count += 1
else:
print(f" ❌ Ошибка сохранения")
error_count += 1
else:
print(f" ❌ Ошибка: {result}")
error_count += 1
# Прогресс
if idx % 10 == 0:
elapsed = time.time() - start_time
speed = idx / elapsed
eta = (total - idx) / speed if speed > 0 else 0
print(f"\n 📊 Прогресс: {idx}/{total} ({idx/total*100:.1f}%)")
print(f" ⏱️ Скорость: {speed:.2f} отелей/сек")
print(f" 🎯 ETA: {eta/60:.0f} минут\n")
print()
# Небольшая задержка между запросами
time.sleep(0.5)
# Итоги
print("\n" + "=" * 60)
print("📊 ИТОГО:")
print(f" ✅ Успешно: {success_count}")
print(f" ❌ Ошибок: {error_count}")
print(f" 📝 Всего отелей обработано: {success_count}")
print(f" ⏱️ Время работы: {(time.time() - start_time)/60:.1f} минут")
print()
print("🎉 Повторная обработка завершена!")
if __name__ == "__main__":
main()