Files
hotels/website_schema.sql
Фёдор 684fada337 🚀 Full project sync: Hotels RAG & Audit System
 Major Features:
- Complete RAG system for hotel website analysis
- Hybrid audit with BGE-M3 embeddings + Natasha NER
- Universal horizontal Excel reports with dashboards
- Multi-region processing (SPb, Orel, Chukotka, Kamchatka)

📊 Completed Regions:
- Орловская область: 100% (36/36)
- Чукотский АО: 100% (4/4)
- г. Санкт-Петербург: 93% (893/960)
- Камчатский край: 87% (89/102)

🔧 Infrastructure:
- PostgreSQL with pgvector extension
- BGE-M3 embeddings API
- Browserless for web scraping
- N8N workflows for automation
- S3/Nextcloud file storage

📝 Documentation:
- Complete DB schemas
- API documentation
- Setup guides
- Status reports
2025-10-27 22:49:42 +03:00

61 lines
2.4 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- Schema for storing raw data collected from hotel websites.

-- hotel_website_raw: raw HTML of crawled pages.
-- The UNIQUE constraint keeps at most one row per (hotel_id, url) pair.
-- NOTE(review): hotel_id is nullable and PostgreSQL treats NULLs as distinct
-- in UNIQUE constraints, so rows with a NULL hotel_id are not deduplicated by
-- url -- confirm whether that is intended.
CREATE TABLE IF NOT EXISTS hotel_website_raw (
    id               SERIAL,
    hotel_id         UUID,
    url              TEXT NOT NULL,
    page_title       TEXT,
    html             TEXT,      -- raw HTML
    status_code      INT,
    response_time_ms INT,
    depth            INT,       -- 0 = main page, 1 = internal link
    crawled_at       TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (hotel_id) REFERENCES hotel_main (id),
    UNIQUE (hotel_id, url)
);
-- hotel_website_meta: metadata about the crawl of a hotel's website.
-- hotel_id is both the primary key and a reference to hotel_main, so there is
-- at most one row per hotel.
-- NOTE(review): the crawl_status values listed below are not enforced by a
-- CHECK constraint, and nothing in this script refreshes updated_at on
-- UPDATE -- presumably the writer maintains both; confirm.
CREATE TABLE IF NOT EXISTS hotel_website_meta (
    hotel_id             UUID,
    domain               TEXT,
    main_url             TEXT,
    pages_crawled        INT    DEFAULT 0,
    pages_failed         INT    DEFAULT 0,
    total_size_bytes     BIGINT DEFAULT 0,
    internal_links_found INT,
    crawl_status         TEXT,  -- 'in_progress', 'completed', 'failed'
    crawl_started_at     TIMESTAMP WITHOUT TIME ZONE,
    crawl_finished_at    TIMESTAMP WITHOUT TIME ZONE,
    error_message        TEXT,
    created_at           TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at           TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (hotel_id),
    FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
-- hotel_website_processed: processed text (after cleaning, for vectorization).
-- Each row may point back to its source page via raw_page_id and to the
-- owning hotel via hotel_id; both references are nullable.
CREATE TABLE IF NOT EXISTS hotel_website_processed (
    id             SERIAL,
    raw_page_id    INT,
    hotel_id       UUID,
    url            TEXT,
    cleaned_text   TEXT,     -- cleaned text
    extracted_data JSONB,    -- phones, emails, INN, OGRN, etc.
    has_forms      BOOLEAN,
    has_booking    BOOLEAN,
    text_length    INT,
    processed_at   TIMESTAMP WITHOUT TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (raw_page_id) REFERENCES hotel_website_raw (id),
    FOREIGN KEY (hotel_id) REFERENCES hotel_main (id)
);
-- Secondary indexes.
-- All statements use IF NOT EXISTS so the script can be re-run safely.

-- Pages of a given hotel.
CREATE INDEX IF NOT EXISTS idx_website_raw_hotel_id
    ON hotel_website_raw (hotel_id);

-- Lookup of a raw page by URL regardless of hotel.
CREATE INDEX IF NOT EXISTS idx_website_raw_url
    ON hotel_website_raw (url);

-- Filtering crawl metadata by status.
CREATE INDEX IF NOT EXISTS idx_website_meta_status
    ON hotel_website_meta (crawl_status);

-- Processed rows of a given hotel.
CREATE INDEX IF NOT EXISTS idx_website_processed_hotel_id
    ON hotel_website_processed (hotel_id);

-- Index the raw_page_id foreign key: PostgreSQL does not create one
-- automatically for referencing columns, so without it every DELETE (or id
-- UPDATE) on hotel_website_raw has to scan hotel_website_processed to check
-- the constraint. It also serves raw-page -> processed-row lookups.
CREATE INDEX IF NOT EXISTS idx_website_processed_raw_page_id
    ON hotel_website_processed (raw_page_id);
-- Table descriptions. The literals are kept in Russian exactly as written;
-- an English translation is given above each statement.

-- "Raw HTML of hotel website pages (sources)"
COMMENT ON TABLE hotel_website_raw
    IS 'Сырой HTML со страниц сайтов отелей (исходники)';

-- "Metadata about website crawling"
COMMENT ON TABLE hotel_website_meta
    IS 'Метаинформация о краулинге сайтов';

-- "Processed text for vectorization"
COMMENT ON TABLE hotel_website_processed
    IS 'Обработанный текст для векторизации';