feat: OCR в фоне + AI проверка полиса + debug логи

Новый сервис ocr_service.py:
✅ OCR распознавание через http://147.45.146.17:8001
✅ AI анализ через Gemini Vision (OpenRouter)
✅ Проверка: полис или шляпа
✅ Извлечение данных полиса автоматически

Логика обработки:
1. Файл загружается в S3
2. OCR запускается в фоне (RabbitMQ queue)
3. Gemini Vision анализирует текст:
   - document_type: policy/passport/ticket/garbage
   - is_valid_policy: true/false
   - confidence: 0.0-1.0
   - extracted_data: voucher, holder_name, dates
4. Результат сохраняется в Redis (1 час TTL)

Debug логи (в backend):
📤 OCR task queued: file_id - filename
💾 OCR result cached in Redis
📊 Document type: policy/garbage/other
✅ Valid: true/false, Confidence: 0.95
🗑️ GARBAGE uploaded: если не полис (silent)

Новый endpoint:
GET /api/v1/upload/ocr-result/{file_id}
- Получить результат OCR из Redis
- Можно запрашивать когда угодно

Где смотреть логи:
tail -f /var/www/fastuser/data/www/crm.clientright.ru/erv_platform_backend.log

Результаты хранятся в Redis:
key: ocr_result:{file_id}
ttl: 3600 сек (1 час)
2025-10-24 21:58:34 +03:00
parent 621c8ebf01
commit 20bad53008
2 changed files with 258 additions and 1 deletions
--- a/backend/app/api/upload.py
+++ b/backend/app/api/upload.py
@@ -8,7 +8,11 @@ import uuid
 import os
 from ..config import settings
 from ..services.s3_service import s3_service
+from ..services.ocr_service import ocr_service
+from ..services.redis_service import redis_service
+from ..services.rabbitmq_service import rabbitmq_service
 import logging
+import json

 router = APIRouter(prefix="/api/v1/upload", tags=["Upload"])
 logger = logging.getLogger(__name__)
@@ -213,12 +217,54 @@ async def upload_files(files: List[UploadFile] = File(...), folder: str = "claim
                )
                
                if file_url:
+                    file_id = str(uuid.uuid4())
+                    
+                    # Запускаем OCR в фоне через RabbitMQ
+                    ocr_task = {
+                        "file_id": file_id,
+                        "file_url": file_url,
+                        "filename": file.filename,
+                        "folder": folder,
+                        "content_type": file.content_type
+                    }
+                    
+                    try:
+                        # Отправляем задачу в очередь OCR
+                        await rabbitmq_service.publish(
+                            queue_name="erv_ocr_processing",
+                            message=json.dumps(ocr_task)
+                        )
+                        logger.info(f"📤 OCR task queued: {file_id} - {file.filename}")
+                        
+                        # Также сразу запускаем OCR для быстрого результата
+                        # (параллельно с очередью для бэкапа)
+                        ocr_result = await ocr_service.process_document(content, file.filename)
+                        
+                        # Сохраняем результат в Redis на 1 час
+                        await redis_service.set(
+                            f"ocr_result:{file_id}",
+                            json.dumps(ocr_result, ensure_ascii=False),
+                            expire=3600
+                        )
+                        
+                        logger.info(f"💾 OCR result cached in Redis: {file_id}")
+                        logger.info(f"📊 Document type: {ocr_result.get('document_type')}")
+                        logger.info(f"✅ Valid: {ocr_result.get('is_valid')}, Confidence: {ocr_result.get('confidence')}")
+                        
+                        if ocr_result.get('document_type') == 'garbage':
+                            logger.warning(f"🗑️ GARBAGE uploaded: {file.filename} (but user doesn't know)")
+                        
+                    except Exception as queue_error:
+                        logger.error(f"⚠️ Queue error (non-critical): {queue_error}")
+                    
                    uploaded_files.append({
                        "success": True,
                        "filename": file.filename,
                        "url": file_url,
+                        "file_id": file_id,
                        "size": len(content),
-                        "content_type": file.content_type
+                        "content_type": file.content_type,
+                        "ocr_result": ocr_result if 'ocr_result' in locals() else None
                    })
                else:
                    uploaded_files.append({
@@ -246,3 +292,38 @@ async def upload_files(files: List[UploadFile] = File(...), folder: str = "claim
        logger.error(f"Batch upload error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

+
+@router.get("/ocr-result/{file_id}")
+async def get_ocr_result(file_id: str):
+    """
+    Fetch the OCR result stored in Redis under the key ``ocr_result:{file_id}``.
+    
+    Args:
+        file_id: File UUID
+    
+    Returns:
+        Dict with found=True and the decoded OCR result, or found=False when the key is missing
+    """
+    try:
+        # Read from Redis
+        result_json = await redis_service.get(f"ocr_result:{file_id}")
+        
+        if result_json:
+            result = json.loads(result_json)  # the value is stored as a JSON string
+            return {
+                "success": True,
+                "found": True,
+                "file_id": file_id,
+                "ocr_result": result
+            }
+        else:  # nothing stored under this key (or the stored value is empty)
+            return {
+                "success": True,
+                "found": False,
+                "message": "OCR результат еще не готов или не найден"
+            }
+    
+    except Exception as e:  # any Redis or JSON-decoding failure is reported as HTTP 500
+        logger.error(f"Error getting OCR result: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -0,0 +1,176 @@
+"""
+OCR Service - Распознавание документов + AI проверка
+"""
+import httpx
+import logging
+from typing import Optional, Dict, Any
+from ..config import settings
+import json
+
+logger = logging.getLogger(__name__)
+
+
+class OCRService:
+    """Service for OCR and AI analysis of documents"""
+    
+    def __init__(self):
+        self.ocr_url = settings.ocr_api_url  # base URL of the OCR API; "/process" is appended per request
+        self.ai_api_key = settings.openrouter_api_key  # sent as the Bearer token to OpenRouter
+        self.ai_model = settings.openrouter_model  # model name placed in the chat-completions payload
+    
+    async def process_document(self, file_content: bytes, filename: str) -> Dict[str, Any]:
+        """
+        Process a document: OCR first, then AI analysis of the recognized text.
+        
+        Args:
+            file_content: Raw bytes of the file, uploaded to the OCR endpoint
+            filename: File name, sent with the upload and used in log messages
+        
+        Returns:
+            Dict with OCR and AI results; on errors the defaults (or partial data) are returned
+        """
+        result = {
+            "ocr_text": "",
+            "ai_analysis": None,
+            "document_type": "unknown",  # policy, passport, ticket, other, garbage
+            "is_valid": False,
+            "confidence": 0.0,
+            "extracted_data": {}
+        }
+        
+        try:
+            # Step 1: OCR text recognition
+            logger.info(f"🔍 Starting OCR for: {filename}")
+            
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                files = {"file": (filename, file_content, "image/jpeg")}  # NOTE(review): MIME type is always image/jpeg, whatever the real file type - confirm the OCR API ignores it
+                response = await client.post(
+                    f"{self.ocr_url}/process",
+                    files=files
+                )
+                
+                if response.status_code == 200:
+                    ocr_result = response.json()
+                    ocr_text = ocr_result.get("text", "")  # NOTE(review): assumes the OCR reply is a JSON object with a string "text" field - confirm
+                    result["ocr_text"] = ocr_text
+                    
+                    logger.info(f"📄 OCR completed: {len(ocr_text)} chars")
+                    logger.debug(f"OCR Text preview: {ocr_text[:200]}...")
+                else:
+                    logger.error(f"❌ OCR failed: {response.status_code}")
+                    return result  # OCR failed: return the defaults and skip the AI step
+            
+            # Step 2: AI analysis - what kind of document is this?
+            logger.info(f"🤖 Starting AI analysis with {self.ai_model}")
+            
+            ai_analysis = await self._analyze_with_vision(ocr_text)  # only the OCR text is passed on, not the file bytes
+            result["ai_analysis"] = ai_analysis
+            
+            if ai_analysis:
+                result["document_type"] = ai_analysis.get("document_type", "unknown")
+                result["is_valid"] = ai_analysis.get("is_valid_policy", False)
+                result["confidence"] = ai_analysis.get("confidence", 0.0)
+                result["extracted_data"] = ai_analysis.get("extracted_data", {})
+                
+                # Log the outcome
+                logger.info(f"✅ AI Analysis complete:")
+                logger.info(f"   Document type: {result['document_type']}")
+                logger.info(f"   Valid policy: {result['is_valid']}")
+                logger.info(f"   Confidence: {result['confidence']}")
+                
+                if result['document_type'] == 'garbage':
+                    logger.warning(f"⚠️ GARBAGE DETECTED: {filename} - not a policy document!")
+                elif result['document_type'] == 'policy':
+                    logger.info(f"✅ VALID POLICY: {filename}")
+                    if result['extracted_data']:
+                        logger.info(f"   Extracted: {json.dumps(result['extracted_data'], ensure_ascii=False)}")
+        
+        except Exception as e:  # best-effort: the failure is only logged and whatever was collected so far is returned
+            logger.error(f"❌ OCR/AI processing error: {e}")
+        
+        return result
+    
+    async def _analyze_with_vision(self, ocr_text: str) -> Optional[Dict[str, Any]]:
+        """
+        Classify OCR text with the configured model via the OpenRouter chat-completions API
+        
+        The prompt asks the model to:
+        - decide whether the document is an insurance policy or something else
+        - extract the policy data; the parsed JSON reply is returned, or None on any failure
+        """
+        try:
+            prompt = f"""Проанализируй этот текст из OCR документа.
+
+Текст: {ocr_text}
+
+Задачи:
+1. Определи тип документа: policy (страховой полис), passport, ticket, other, garbage (не документ)
+2. Если это полис - извлеки данные:
+   - voucher (номер полиса вида E1000-302538524)
+   - holder_name (ФИО держателя)
+   - insured_from (дата начала)
+   - insured_to (дата окончания)
+   - destination (страна/регион)
+3. Оцени confidence (0.0-1.0) насколько уверен
+4. is_valid_policy: true если это реальный страховой полис
+
+Ответь ТОЛЬКО в формате JSON:
+{{
+  "document_type": "policy|passport|ticket|other|garbage",
+  "is_valid_policy": true/false,
+  "confidence": 0.95,
+  "extracted_data": {{
+    "voucher": "E1000-302538524",
+    "holder_name": "...",
+    "insured_from": "DD.MM.YYYY",
+    "insured_to": "DD.MM.YYYY",
+    "destination": "..."
+  }}
+}}"""
+
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                response = await client.post(
+                    "https://openrouter.ai/api/v1/chat/completions",
+                    headers={
+                        "Authorization": f"Bearer {self.ai_api_key}",
+                        "HTTP-Referer": "http://147.45.146.17:8100",
+                        "Content-Type": "application/json"
+                    },
+                    json={
+                        "model": self.ai_model,
+                        "messages": [
+                            {
+                                "role": "user",
+                                "content": prompt
+                            }
+                        ],
+                        "temperature": 0.1,
+                        "max_tokens": 500
+                    }
+                )
+                
+                if response.status_code == 200:
+                    ai_response = response.json()
+                    content = ai_response["choices"][0]["message"]["content"]
+                    
+                    # Parse JSON from the reply
+                    # Strip a markdown code fence if present
+                    if "```json" in content:
+                        content = content.split("```json")[1].split("```")[0]
+                    elif "```" in content:
+                        content = content.split("```")[1].split("```")[0]
+                    
+                    analysis = json.loads(content.strip())  # a non-JSON reply raises here and is handled by the except below
+                    return analysis
+                else:
+                    logger.error(f"❌ AI API error: {response.status_code}")
+                    return None
+        
+        except Exception as e:  # network, response-shape and JSON errors all end up as None
+            logger.error(f"❌ AI analysis error: {e}")
+            return None
+
+
+# Module-level shared instance (imported by name elsewhere in the app)
+ocr_service = OCRService()
+