🚀 Full project sync: Hotels RAG & Audit System
✨ Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
This commit is contained in:
60
website_schema.sql
Normal file
60
website_schema.sql
Normal file
@@ -0,0 +1,60 @@
|
||||
-- Схема для хранения сырых данных с сайтов отелей
|
||||
|
||||
-- Raw HTML fetched from hotel website pages.
-- A given (hotel_id, url) pair may be stored only once.
CREATE TABLE IF NOT EXISTS hotel_website_raw (
    id               SERIAL,
    hotel_id         UUID,
    url              TEXT NOT NULL,
    page_title       TEXT,
    html             TEXT,       -- raw HTML
    status_code      INT,
    response_time_ms INT,
    depth            INT,        -- 0 = main page, 1 = internal link
    crawled_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Constraints are declared at table level using the same names
    -- PostgreSQL assigns by default, so the resulting schema is unchanged.
    CONSTRAINT hotel_website_raw_pkey
        PRIMARY KEY (id),
    CONSTRAINT hotel_website_raw_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id),
    CONSTRAINT hotel_website_raw_hotel_id_url_key
        UNIQUE (hotel_id, url)
);
|
||||
|
||||
-- Metadata about the crawl of a hotel's website.
-- hotel_id is the primary key, so each hotel has at most one row.
CREATE TABLE IF NOT EXISTS hotel_website_meta (
    hotel_id             UUID,
    domain               TEXT,
    main_url             TEXT,
    pages_crawled        INT    DEFAULT 0,
    pages_failed         INT    DEFAULT 0,
    total_size_bytes     BIGINT DEFAULT 0,
    internal_links_found INT,
    crawl_status         TEXT,  -- 'in_progress', 'completed', 'failed'
    crawl_started_at     TIMESTAMP,
    crawl_finished_at    TIMESTAMP,
    error_message        TEXT,
    created_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): only an insert-time default is defined here; nothing in
    -- this DDL refreshes updated_at on UPDATE.
    updated_at           TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Constraints are declared at table level using the same names
    -- PostgreSQL assigns by default, so the resulting schema is unchanged.
    CONSTRAINT hotel_website_meta_pkey
        PRIMARY KEY (hotel_id),
    CONSTRAINT hotel_website_meta_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
|
||||
|
||||
-- Processed text (after cleaning, intended for vectorization).
-- Each row may point back to its source page via raw_page_id.
CREATE TABLE IF NOT EXISTS hotel_website_processed (
    id             SERIAL,
    raw_page_id    INT,
    hotel_id       UUID,
    url            TEXT,
    cleaned_text   TEXT,   -- cleaned text
    extracted_data JSONB,  -- phones, email, INN, OGRN, etc.
    has_forms      BOOLEAN,
    has_booking    BOOLEAN,
    text_length    INT,
    processed_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- Constraints are declared at table level using the same names
    -- PostgreSQL assigns by default, so the resulting schema is unchanged.
    CONSTRAINT hotel_website_processed_pkey
        PRIMARY KEY (id),
    CONSTRAINT hotel_website_processed_raw_page_id_fkey
        FOREIGN KEY (raw_page_id) REFERENCES hotel_website_raw (id),
    CONSTRAINT hotel_website_processed_hotel_id_fkey
        FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
|
||||
|
||||
-- Secondary indexes for the website crawl tables.
-- Every statement uses IF NOT EXISTS so the script can be re-run safely.

-- Look up raw pages by hotel.
-- NOTE(review): the UNIQUE (hotel_id, url) constraint on hotel_website_raw
-- already provides an index with hotel_id as its leading column; this
-- single-column index is kept for backward compatibility.
CREATE INDEX IF NOT EXISTS idx_website_raw_hotel_id
    ON hotel_website_raw (hotel_id);

-- Look up raw pages by URL regardless of hotel.
CREATE INDEX IF NOT EXISTS idx_website_raw_url
    ON hotel_website_raw (url);

-- Filter crawl metadata by status.
CREATE INDEX IF NOT EXISTS idx_website_meta_status
    ON hotel_website_meta (crawl_status);

-- Look up processed pages by hotel.
CREATE INDEX IF NOT EXISTS idx_website_processed_hotel_id
    ON hotel_website_processed (hotel_id);

-- PostgreSQL does not index the referencing side of a foreign key
-- automatically. Without this index, deleting or re-keying a row in
-- hotel_website_raw, or joining processed rows to their raw page, has to
-- scan hotel_website_processed.
CREATE INDEX IF NOT EXISTS idx_website_processed_raw_page_id
    ON hotel_website_processed (raw_page_id);
|
||||
|
||||
-- Table descriptions stored in the database catalog.
-- The literals are kept in Russian exactly as originally written.

-- "Raw HTML from hotel website pages (sources)"
COMMENT ON TABLE hotel_website_raw
    IS 'Сырой HTML со страниц сайтов отелей (исходники)';

-- "Metadata about website crawling"
COMMENT ON TABLE hotel_website_meta
    IS 'Метаинформация о краулинге сайтов';

-- "Processed text for vectorization"
COMMENT ON TABLE hotel_website_processed
    IS 'Обработанный текст для векторизации';
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user