✨ Major Features: - Complete RAG system for hotel website analysis - Hybrid audit with BGE-M3 embeddings + Natasha NER - Universal horizontal Excel reports with dashboards - Multi-region processing (SPb, Orel, Chukotka, Kamchatka) 📊 Completed Regions: - Орловская область: 100% (36/36) - Чукотский АО: 100% (4/4) - г. Санкт-Петербург: 93% (893/960) - Камчатский край: 87% (89/102) 🔧 Infrastructure: - PostgreSQL with pgvector extension - BGE-M3 embeddings API - Browserless for web scraping - N8N workflows for automation - S3/Nextcloud file storage 📝 Documentation: - Complete DB schemas - API documentation - Setup guides - Status reports
131 lines
4.4 KiB
Python
131 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""Estimate how much longer the hotel-website crawler needs to run.

The script reads progress counters from PostgreSQL (tables ``hotel_main`` and
``hotel_website_processed``), derives the processing rate for the last hour
and the average rate since the first processed record, and prints the
remaining time and the expected completion date for both rates.

Connection settings can be overridden with the environment variables
``HOTEL_DB_HOST``, ``HOTEL_DB_NAME``, ``HOTEL_DB_USER`` and
``HOTEL_DB_PASSWORD``; the built-in values are used when they are not set.
"""
import os
from contextlib import closing
from datetime import datetime, timedelta
from urllib.parse import unquote

# Built-in connection settings, kept so the script runs unchanged when no
# environment overrides are present.
# NOTE(review): a credential is committed to the source here; it should be
# rotated and then supplied through HOTEL_DB_PASSWORD only.
_DEFAULT_DB_HOST = '147.45.189.234'
_DEFAULT_DB_NAME = 'default_db'
_DEFAULT_DB_USER = 'gen_user'
_DEFAULT_DB_PASSWORD_QUOTED = '2~~9_%5EkVsU%3F2%5CS'  # percent-encoded


def connect():
    """Open and return the PostgreSQL connection used by the report.

    Environment variables take precedence over the built-in defaults. The
    ``HOTEL_DB_PASSWORD`` value is used verbatim (it is not percent-decoded).
    """
    # Imported lazily so the pure helpers below stay importable (and
    # testable) on machines where the database driver is not installed.
    import psycopg2

    password = os.environ.get('HOTEL_DB_PASSWORD')
    if password is None:
        password = unquote(_DEFAULT_DB_PASSWORD_QUOTED)
    return psycopg2.connect(
        host=os.environ.get('HOTEL_DB_HOST', _DEFAULT_DB_HOST),
        database=os.environ.get('HOTEL_DB_NAME', _DEFAULT_DB_NAME),
        user=os.environ.get('HOTEL_DB_USER', _DEFAULT_DB_USER),
        password=password,
    )


def fetch_stats(cur):
    """Run the progress queries on *cur* and return the raw values as a dict.

    Keys: ``total_hotels_with_sites``, ``processed_hotels``,
    ``hotels_per_day``, ``hotels_per_hour``, ``first_date``, ``last_date``
    and ``avg_pages``.
    """

    def scalar(sql):
        # Every counter query yields exactly one row with one column.
        cur.execute(sql)
        return cur.fetchone()[0]

    stats = {
        # Overall totals.
        'total_hotels_with_sites': scalar(
            'SELECT COUNT(*) FROM hotel_main WHERE website_address IS NOT NULL'
        ),
        'processed_hotels': scalar(
            'SELECT COUNT(DISTINCT hotel_id) FROM hotel_website_processed'
        ),
        # Hotels processed during the last 24 hours.
        'hotels_per_day': scalar("""
            SELECT COUNT(DISTINCT hotel_id)
            FROM hotel_website_processed
            WHERE processed_at > NOW() - INTERVAL '24 hours'
        """),
        # Hotels processed during the last hour.
        'hotels_per_hour': scalar("""
            SELECT COUNT(DISTINCT hotel_id)
            FROM hotel_website_processed
            WHERE processed_at > NOW() - INTERVAL '1 hour'
        """),
    }

    # Timestamps of the first and the most recent processed record.
    cur.execute("""
        SELECT MIN(processed_at), MAX(processed_at)
        FROM hotel_website_processed
    """)
    stats['first_date'], stats['last_date'] = cur.fetchone()

    # Average number of stored rows (pages) per hotel.
    stats['avg_pages'] = scalar("""
        SELECT AVG(page_count)::numeric(10,1)
        FROM (
            SELECT hotel_id, COUNT(*) as page_count
            FROM hotel_website_processed
            GROUP BY hotel_id
        ) sub
    """)
    return stats


def average_rate(processed_hotels, first_date, last_date, fallback_rate):
    """Return ``(elapsed_hours, hotels_per_hour)`` averaged over the whole run.

    ``elapsed_hours`` is ``None`` when either timestamp is missing. When no
    time has elapsed (or the timestamps are missing) the rate falls back to
    *fallback_rate*, because an average cannot be computed.
    """
    if not (first_date and last_date):
        return None, fallback_rate
    elapsed_hours = (last_date - first_date).total_seconds() / 3600
    if elapsed_hours > 0:
        return elapsed_hours, processed_hotels / elapsed_hours
    return elapsed_hours, fallback_rate


def estimate_remaining(remaining, rate, now):
    """Return ``(hours_left, days_left, eta)`` for *remaining* hotels at *rate*.

    *rate* is in hotels per hour. When it is not positive no estimate is
    possible and ``(0, 0, None)`` is returned.
    """
    if rate > 0:
        hours_left = remaining / rate
        return hours_left, hours_left / 24, now + timedelta(hours=hours_left)
    return 0, 0, None


def format_report(stats, now):
    """Build the report as a list of printable strings.

    *stats* is the dict produced by :func:`fetch_stats`; *now* is the moment
    the completion dates are counted from.
    """
    total = stats['total_hotels_with_sites']
    processed = stats['processed_hotels']
    per_hour = stats['hotels_per_hour']
    per_day = stats['hotels_per_day']
    first_date = stats['first_date']
    last_date = stats['last_date']

    # The processed table may hold hotels that are not counted in ``total``;
    # never report a negative amount of remaining work.
    remaining = max(total - processed, 0)
    # Guard against an empty hotel_main (would otherwise divide by zero).
    percent = processed / total * 100 if total else 0.0

    elapsed, avg_rate = average_rate(processed, first_date, last_date, per_hour)
    # The two estimates are independent: an idle last hour must not wipe out
    # the estimate based on the historical average.
    hours_cur, days_cur, eta_cur = estimate_remaining(remaining, per_hour, now)
    hours_avg, days_avg, eta_avg = estimate_remaining(remaining, avg_rate, now)

    lines = [
        "📊 ОЦЕНКА ВРЕМЕНИ РАБОТЫ КРАУЛЕРА",
        "=" * 60,
        "\n🏨 ВСЕГО:",
        f" Отелей с сайтами: {total:,}",
        f" Обработано: {processed:,} ({percent:.1f}%)",
        f" Осталось: {remaining:,}",
        "\n⚡ СКОРОСТЬ:",
        f" За последний час: {per_hour} отелей/час",
        f" За последние 24ч: {per_day} отелей/день",
    ]
    if avg_rate > 0:
        lines.append(f" Средняя с начала: {avg_rate:.1f} отелей/час")

    if elapsed is not None:
        lines += [
            "\n📅 ПЕРИОД:",
            f" Начало: {first_date.strftime('%Y-%m-%d %H:%M')}",
            f" Сейчас: {last_date.strftime('%Y-%m-%d %H:%M')}",
            f" Прошло: {elapsed:.1f} часов ({elapsed/24:.1f} дней)",
        ]

    lines.append(
        f"\n⏱️ ОЦЕНКА ВРЕМЕНИ (по текущей скорости {per_hour} отелей/час):"
    )
    if eta_cur:
        lines += [
            f" Осталось времени: {hours_cur:.1f} часов ({days_cur:.1f} дней)",
            f" Завершение: {eta_cur.strftime('%Y-%m-%d %H:%M')}",
        ]
    else:
        lines.append(" Недостаточно данных")

    if avg_rate > 0:
        lines += [
            f"\n⏱️ ОЦЕНКА ВРЕМЕНИ (по средней скорости {avg_rate:.1f} отелей/час):",
            f" Осталось времени: {hours_avg:.1f} часов ({days_avg:.1f} дней)",
        ]
        if eta_avg:
            lines.append(f" Завершение: {eta_avg.strftime('%Y-%m-%d %H:%M')}")

    lines += [
        "\n📄 СТАТИСТИКА СТРАНИЦ:",
        f" Среднее страниц на отель: {stats['avg_pages']}",
    ]
    return lines


def main():
    """Query the database and print the crawler time estimate."""
    # ``closing`` guarantees the connection is released even if a query
    # fails; the cursor context manager closes the cursor the same way.
    with closing(connect()) as conn:
        with conn.cursor() as cur:
            stats = fetch_stats(cur)
    for line in format_report(stats, datetime.now()):
        print(line)


if __name__ == "__main__":
    main()
|
||
|
||
|
||
|
||
|
||
|