Проект аудита отелей: основные скрипты и документация

- Краулеры: smart_crawler.py, regional_crawler.py
- Аудит: audit_orel_to_excel.py, audit_chukotka_to_excel.py
- РКН проверка: check_rkn_registry.py, recheck_unclear_rkn.py
- Отчёты: create_orel_horizontal_report.py
- Обработка: process_all_hotels_embeddings.py
- Документация: README.md, DB_SCHEMA_REFERENCE.md
This commit is contained in:
Фёдор
2025-10-16 10:52:09 +03:00
parent 545e199389
commit 0cf3297290
105 changed files with 28743 additions and 0 deletions

50
check_progress.sh Executable file
View File

@@ -0,0 +1,50 @@
#!/bin/bash
# check_progress.sh - one-shot status report for the background jobs.
# This part prints the matching running processes and recent log lines.

# Extended regex (grep -E) selecting the `ps aux` lines to report.
watched_jobs="smart_crawler|process_all_hotels_embeddings"

# recent_matches REGEX FILE...
# Takes the last 5 lines of the given log files, keeps the lines matching
# REGEX and prints at most the final 3 of them. Errors from tail (for
# example when the glob matched no file) are discarded.
recent_matches() {
    local wanted=$1
    shift
    tail -5 "$@" 2>/dev/null | grep -E "$wanted" | tail -3
}

printf '%s\n' "📊 СТАТУС ФОНОВЫХ ПРОЦЕССОВ"
printf '%s\n' "============================================================"
printf '\n'

# Running jobs: `grep -v grep` drops every line containing "grep", which
# removes the filter process itself; awk prints columns 2, 11, 12 and 13.
printf '%s\n' "🔍 Активные процессы:"
ps aux \
    | grep -E "$watched_jobs" \
    | grep -v grep \
    | awk '{ printf " PID: %s - %s %s %s\n", $2, $11, $12, $13 }'
printf '\n'

# Crawler logs.
printf '%s\n' "📝 Последние логи краулера:"
recent_matches "INFO|ERROR" smart_crawler_output_*.log
printf '\n'

# Embedding / chunking logs.
printf '%s\n' "📝 Последние логи чанкинизации:"
recent_matches "INFO|ERROR|отелей|chunks" embeddings_processing_*.log
printf '\n'
echo "📈 Статистика из БД:"
# Database statistics: an inline Python program (quoted heredoc, so the shell
# performs no expansion inside it) that prints crawl and chunk counters.
#
# Connection settings are taken from the standard libpq environment variables.
# PGHOST, PGPORT, PGDATABASE and PGUSER are optional and default to the values
# this script has always used. The password is intentionally NOT stored here:
# supply it through PGPASSWORD or a ~/.pgpass entry, both of which libpq reads
# on its own. NOTE(review): the password that used to be committed in this
# file should be considered leaked and rotated.
python3 << 'PYEOF'
import os
import sys

import psycopg2
from psycopg2.extras import RealDictCursor

DB_CONFIG = {
    'host': os.environ.get('PGHOST', '147.45.189.234'),
    'port': os.environ.get('PGPORT', 5432),
    'database': os.environ.get('PGDATABASE', 'default_db'),
    'user': os.environ.get('PGUSER', 'gen_user'),
    # No 'password' key: libpq falls back to PGPASSWORD / ~/.pgpass.
}


def fetch_count(cur, sql):
    """Run a single-row query and return the value of its `count` column."""
    cur.execute(sql)
    return cur.fetchone()['count']


try:
    conn = psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)
except psycopg2.Error as exc:
    # Report a short message instead of a traceback; keep a non-zero status.
    print(f"DB connection failed (is PGPASSWORD or ~/.pgpass set?): {exc}",
          file=sys.stderr)
    sys.exit(1)

try:
    # The cursor context manager closes the cursor even if a query raises.
    with conn.cursor() as cur:
        crawled = fetch_count(
            cur,
            "SELECT COUNT(DISTINCT hotel_id) as count FROM hotel_website_raw",
        )
        chunked = fetch_count(
            cur,
            "SELECT COUNT(DISTINCT metadata->>'hotel_id') as count "
            "FROM hotel_website_chunks "
            "WHERE metadata->>'hotel_id' IS NOT NULL",
        )
        total_chunks = fetch_count(
            cur,
            "SELECT COUNT(*) as count FROM hotel_website_chunks",
        )
finally:
    # Always release the connection, including on query errors.
    conn.close()

print(f" 🕷️ Краулинг: {crawled:,} отелей")
print(f" 📦 Chunks: {chunked:,} отелей ({total_chunks:,} chunks)")
PYEOF