#!/usr/bin/env python3
"""Semantic search API over hotel-website embeddings.

Backend for the web interface: embeds the user's query through an external
BGE embedding service and ranks stored chunks by vector distance in
PostgreSQL.
"""

from contextlib import closing, contextmanager
import json
from typing import Dict, List, Optional
from urllib.parse import unquote

from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
import psycopg2
from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
import requests

app = FastAPI(
    title="Semantic Search API",
    description="API для семантического поиска по эмбеддингам отелей",
    version="1.0.0"
)

# Configuration.
# NOTE(review): the database password and the embedding-service API key are
# hard-coded below; they should be moved to environment variables or a
# secret store and rotated.
DB_CONFIG = {
    'host': "147.45.189.234",
    'port': 5432,
    'database': "default_db",
    'user': "gen_user",
    # Stored percent-encoded; decoded once at import time.
    'password': unquote("2~~9_%5EkVsU%3F2%5CS")
}

BGE_API_URL = "http://147.45.146.17:8002/embed"
BGE_API_KEY = "22564b177aa73b6ac0b8642d7773350ff4c01d4983f028beff15ea247f09fa89"

# Per-region aggregate, shared by GET /regions and GET /stats.
_REGION_STATS_SQL = """
    SELECT
        metadata->>'region_name' as region_name,
        COUNT(DISTINCT metadata->>'hotel_id') as hotels_count,
        COUNT(*) as chunks_count
    FROM hotel_website_chunks
    WHERE metadata->>'region_name' IS NOT NULL
    GROUP BY metadata->>'region_name'
    ORDER BY chunks_count DESC;
"""

# Per-hotel aggregate; `{where}` is filled with a fixed fragment only
# (never with user input), user values go through query parameters.
_HOTELS_SQL_TEMPLATE = """
    SELECT DISTINCT
        metadata->>'hotel_id' as hotel_id,
        metadata->>'hotel_name' as hotel_name,
        metadata->>'region_name' as region_name,
        COUNT(*) as chunks_count
    FROM hotel_website_chunks
    {where}
    GROUP BY metadata->>'hotel_id', metadata->>'hotel_name', metadata->>'region_name'
    ORDER BY chunks_count DESC;
"""

_TOTAL_STATS_SQL = """
    SELECT
        COUNT(*) as total_chunks,
        COUNT(DISTINCT metadata->>'hotel_id') as total_hotels,
        COUNT(DISTINCT metadata->>'region_name') as total_regions,
        AVG(LENGTH(text)) as avg_chunk_length
    FROM hotel_website_chunks;
"""


class SearchRequest(BaseModel):
    """Request body for POST /search."""

    query: str
    region: Optional[str] = None    # exact match on metadata->>'region_name'
    hotel_id: Optional[str] = None  # exact match on metadata->>'hotel_id'
    limit: int = 10                 # passed to SQL LIMIT
    # NOTE(review): not read by any code in this file — confirm whether a
    # distance filter was intended.
    min_distance: float = 0.3


class SearchResult(BaseModel):
    """One ranked chunk returned by POST /search."""

    hotel_name: str
    region_name: str
    url: str
    text: str
    distance: float
    relevance: str


def get_db_connection():
    """Open a new database connection whose cursors return dict rows.

    The caller is responsible for closing the returned connection.
    """
    return psycopg2.connect(**DB_CONFIG, cursor_factory=RealDictCursor)


@contextmanager
def _db_cursor():
    """Yield a cursor on a fresh connection; close both even on error."""
    # `closing` is used deliberately: psycopg2's own `with connection:` only
    # ends the transaction, it does not close the connection.
    with closing(get_db_connection()) as conn, closing(conn.cursor()) as cur:
        yield cur


def _fetch_all(query, params=None):
    """Run a single query and return all of its rows."""
    with _db_cursor() as cur:
        cur.execute(query, params)
        return cur.fetchall()


def _fetch_stats():
    """Return (totals_row, region_rows), both read on one connection."""
    with _db_cursor() as cur:
        cur.execute(_TOTAL_STATS_SQL)
        totals = cur.fetchone()
        cur.execute(_REGION_STATS_SQL)
        region_rows = cur.fetchall()
    return totals, region_rows


def _region_rows_to_dicts(rows):
    """Project region aggregate rows onto the public response keys."""
    return [
        {
            "region_name": row['region_name'],
            "hotels_count": row['hotels_count'],
            "chunks_count": row['chunks_count'],
        }
        for row in rows
    ]


def _relevance_label(distance):
    """Map a vector distance onto the three user-facing relevance labels."""
    if distance < 0.9:
        return "🟢 Высокая"
    if distance < 1.0:
        return "🟡 Средняя"
    return "🔴 Низкая"


def generate_query_embedding(query: str):
    """Return the embedding vector for a search query.

    Posts ``{"text": query}`` to the BGE service and returns the first item
    of the ``embeddings`` field of its JSON response.

    Raises:
        HTTPException: status 500 when the service answers with a non-200
            status, returns no embedding, or the call fails in any other way.
    """
    headers = {
        "X-API-Key": BGE_API_KEY,
        "Content-Type": "application/json"
    }
    payload = {"text": query}

    try:
        response = requests.post(BGE_API_URL, json=payload, headers=headers, timeout=30)
        if response.status_code != 200:
            raise HTTPException(status_code=500, detail=f"BGE API error: {response.status_code}")

        embedding = response.json().get('embeddings', [[]])[0]
        if not embedding:
            # Fail here with a clear message instead of letting an empty
            # vector reach the database cast.
            raise HTTPException(status_code=500, detail="BGE API error: empty embedding")
        return embedding
    except HTTPException:
        # Already a well-formed API error; do not re-wrap it below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding generation error: {str(e)}") from e


@app.get("/")
async def root():
    """Describe the API and list its endpoints."""
    return {
        "name": "Semantic Search API",
        "version": "1.0.0",
        "description": "API для семантического поиска по эмбеддингам отелей",
        "endpoints": [
            "POST /search - Семантический поиск",
            "GET /regions - Список регионов",
            "GET /hotels - Список отелей",
            "GET /stats - Статистика"
        ]
    }


@app.post("/search", response_model=List[SearchResult])
async def semantic_search(request: SearchRequest):
    """Rank stored chunks by vector distance to the query embedding."""
    try:
        # Both the HTTP call and the DB query are blocking, so they run in
        # the thread pool to keep the event loop responsive.
        query_embedding = await run_in_threadpool(generate_query_embedding, request.query)
        embedding_str = json.dumps(query_embedding)

        # Optional exact-match filters; values are always bound parameters.
        filters = ["embedding IS NOT NULL"]
        filter_params = []
        if request.region:
            filters.append("metadata->>'region_name' = %s")
            filter_params.append(request.region)
        if request.hotel_id:
            filters.append("metadata->>'hotel_id' = %s")
            filter_params.append(request.hotel_id)
        where_clause = " AND ".join(filters)

        # Only fixed SQL fragments are interpolated into the f-string.
        sql = f"""
            SELECT
                metadata->>'hotel_name' as hotel_name,
                metadata->>'region_name' as region_name,
                metadata->>'url' as url,
                LEFT(text, 300) as text,
                embedding <-> %s::vector as distance
            FROM hotel_website_chunks
            WHERE {where_clause}
            ORDER BY embedding <-> %s::vector
            LIMIT %s;
        """
        # Placeholder order: SELECT embedding, WHERE filters, ORDER BY
        # embedding, LIMIT.
        sql_params = [embedding_str, *filter_params, embedding_str, request.limit]

        rows = await run_in_threadpool(_fetch_all, sql, sql_params)

        return [
            SearchResult(
                hotel_name=row['hotel_name'] or "Неизвестный отель",
                region_name=row['region_name'] or "Неизвестный регион",
                url=row['url'] or "",
                text=row['text'] or "",
                distance=float(row['distance']),
                relevance=_relevance_label(row['distance'])
            )
            for row in rows
        ]
    except HTTPException:
        # Keep the original status/detail (e.g. from the embedding service).
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/regions")
async def get_regions():
    """List regions with their hotel and chunk counts."""
    try:
        rows = await run_in_threadpool(_fetch_all, _REGION_STATS_SQL)
        return {"regions": _region_rows_to_dicts(rows)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/hotels")
async def get_hotels(region: Optional[str] = None):
    """List hotels with chunk counts, optionally limited to one region."""
    try:
        if region:
            sql = _HOTELS_SQL_TEMPLATE.format(where="WHERE metadata->>'region_name' = %s")
            params = (region,)
        else:
            sql = _HOTELS_SQL_TEMPLATE.format(where="")
            params = None

        rows = await run_in_threadpool(_fetch_all, sql, params)

        hotels = [
            {
                "hotel_id": row['hotel_id'],
                "hotel_name": row['hotel_name'],
                "region_name": row['region_name'],
                "chunks_count": row['chunks_count'],
            }
            for row in rows
        ]
        return {"hotels": hotels}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/stats")
async def get_stats():
    """Return overall totals plus the per-region breakdown."""
    try:
        stats, region_rows = await run_in_threadpool(_fetch_stats)

        avg_length = stats['avg_chunk_length']
        return {
            "total_chunks": stats['total_chunks'],
            "total_hotels": stats['total_hotels'],
            "total_regions": stats['total_regions'],
            # AVG is NULL on an empty table; report 0 in that case.
            "avg_chunk_length": float(avg_length) if avg_length else 0,
            "regions": _region_rows_to_dicts(region_rows)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8001)