Save all currently accumulated repository changes as a backup snapshot for Gitea so no local work is lost.
257 lines
7.5 KiB
PHP
257 lines
7.5 KiB
PHP
<?php
|
||
/**
|
||
* Пример использования таблицы sprf_court_decisions
|
||
* Для работы с судебными решениями из Telegram
|
||
*/
|
||
|
||
/**
 * Compute the SHA-256 digest of a file's contents.
 *
 * @param string $filePath Path of the file to hash.
 * @return string|false Hexadecimal digest as returned by hash_file(), or
 *                      false when hash_file() reports a failure.
 */
function calculateFileHash($filePath) {
    $algorithm = 'sha256';
    $digest = hash_file($algorithm, $filePath);

    return $digest;
}
|
||
|
||
/**
 * Insert a row into sprf_court_decisions for a file received via Telegram.
 *
 * The uuid is generated by the database (gen_random_uuid()) and the row is
 * created with processing_status 'pending'.
 *
 * @param PDO   $pdo      Connection used to prepare and run the INSERT.
 * @param array $fileData File attributes. The keys file_name, file_size,
 *                        mime_type, file_hash, s3_url and telegram_user_id are
 *                        read directly; telegram_message_id, telegram_chat_id,
 *                        telegram_username and telegram_full_name are optional
 *                        and bound as null when absent.
 * @return array|false Result of fetch(PDO::FETCH_ASSOC) on the RETURNING
 *                     clause (id, uuid, uploaded_at).
 */
function saveCourtDecisionFile($pdo, $fileData) {
    $insertQuery = "
        INSERT INTO sprf_court_decisions (
            uuid,
            file_name,
            file_size,
            mime_type,
            file_hash,
            s3_url,
            telegram_message_id,
            telegram_chat_id,
            telegram_user_id,
            telegram_username,
            telegram_full_name,
            processing_status
        ) VALUES (
            gen_random_uuid()::text,
            :file_name,
            :file_size,
            :mime_type,
            :file_hash,
            :s3_url,
            :telegram_message_id,
            :telegram_chat_id,
            :telegram_user_id,
            :telegram_username,
            :telegram_full_name,
            'pending'
        ) RETURNING id, uuid, uploaded_at
    ";

    $statement = $pdo->prepare($insertQuery);

    // Field name => whether the key is read directly (true) or falls back to
    // null when missing (false). Order matches the placeholder list above.
    $fields = [
        'file_name'           => true,
        'file_size'           => true,
        'mime_type'           => true,
        'file_hash'           => true,
        's3_url'              => true,
        'telegram_message_id' => false,
        'telegram_chat_id'    => false,
        'telegram_user_id'    => true,
        'telegram_username'   => false,
        'telegram_full_name'  => false,
    ];

    $bindings = [];
    foreach ($fields as $field => $isRequired) {
        if ($isRequired) {
            $bindings[':' . $field] = $fileData[$field];
        } else {
            $bindings[':' . $field] = $fileData[$field] ?? null;
        }
    }

    $statement->execute($bindings);

    return $statement->fetch(PDO::FETCH_ASSOC);
}
|
||
|
||
/**
 * Look up an already stored file by its content hash.
 *
 * @param PDO    $pdo      Connection used to prepare and run the SELECT.
 * @param string $fileHash Hash value compared against the file_hash column.
 * @return array|false Result of fetch(PDO::FETCH_ASSOC): at most one row with
 *                     id, uuid, file_name, uploaded_at and processing_status.
 */
function checkDuplicate($pdo, $fileHash) {
    $lookupQuery = "
        SELECT id, uuid, file_name, uploaded_at, processing_status
        FROM sprf_court_decisions
        WHERE file_hash = :file_hash
        LIMIT 1
    ";

    $bindings = [':file_hash' => $fileHash];

    $statement = $pdo->prepare($lookupQuery);
    $statement->execute($bindings);

    $match = $statement->fetch(PDO::FETCH_ASSOC);

    return $match;
}
|
||
|
||
/**
 * Store the OCR result for a court decision and advance its status.
 *
 * Marks the row as OCR-processed, saves the concatenated page text, the raw
 * per-page data (as jsonb) and the page count. processing_status becomes
 * 'completed' when vector_processed is already true, otherwise 'processing'.
 *
 * @param PDO    $pdo       Connection used to prepare and run the UPDATE.
 * @param string $uuid      uuid of the row to update.
 * @param array  $ocrResult OCR output; 'pages_data' is expected to be a list
 *                          of pages, each optionally carrying 'ocr_text'.
 *                          A missing or non-array 'pages_data' is treated as
 *                          an empty page list.
 * @return array|false Result of fetch(PDO::FETCH_ASSOC) on the RETURNING
 *                     clause (id, uuid, ocr_pages_count).
 * @throws RuntimeException When the page data cannot be encoded as JSON.
 */
function updateOCRStatus($pdo, $uuid, $ocrResult) {
    $sql = "
        UPDATE sprf_court_decisions
        SET
            ocr_processed = TRUE,
            ocr_processed_at = CURRENT_TIMESTAMP,
            ocr_text = :ocr_text,
            ocr_pages_data = :ocr_pages_data::jsonb,
            ocr_pages_count = :ocr_pages_count,
            processing_status = CASE
                WHEN vector_processed THEN 'completed'
                ELSE 'processing'
            END,
            updated_at = CURRENT_TIMESTAMP
        WHERE uuid = :uuid
        RETURNING id, uuid, ocr_pages_count
    ";

    // Normalise the page list once, before anything counts or iterates it:
    // count() on a non-array value throws a TypeError on PHP 8.
    $pages = $ocrResult['pages_data'] ?? [];
    if (!is_array($pages)) {
        $pages = [];
    }
    $pagesCount = count($pages);

    // Collect the text of every page, skipping only pages without text.
    // A plain truthiness filter would also discard a page whose text is "0".
    $pagesTexts = [];
    foreach ($pages as $page) {
        $text = $page['ocr_text'] ?? '';
        if (is_scalar($text) && (string) $text !== '') {
            $pagesTexts[] = (string) $text;
        }
    }
    $fullText = implode("\n\n", $pagesTexts);

    // json_encode() returns false on failure (e.g. malformed UTF-8 in the OCR
    // text); fail here with a clear message instead of binding false to jsonb.
    $pagesJson = json_encode($pages, JSON_UNESCAPED_UNICODE);
    if ($pagesJson === false) {
        throw new RuntimeException(
            'Failed to encode OCR pages data as JSON: ' . json_last_error_msg()
        );
    }

    $stmt = $pdo->prepare($sql);
    $stmt->execute([
        ':uuid' => $uuid,
        ':ocr_text' => $fullText,
        ':ocr_pages_data' => $pagesJson,
        ':ocr_pages_count' => $pagesCount,
    ]);

    return $stmt->fetch(PDO::FETCH_ASSOC);
}
|
||
|
||
/**
 * Record the vectorization result for a court decision.
 *
 * Marks the row as vector-processed, stores the vector store id and the list
 * of vector file ids (as jsonb) and sets processing_status to 'completed'.
 *
 * @param PDO    $pdo           Connection used to prepare and run the UPDATE.
 * @param string $uuid          uuid of the row to update.
 * @param string $vectorStoreId Value written to vector_store_id.
 * @param mixed  $fileIds       Value JSON-encoded into vector_file_ids.
 * @return array|false Result of fetch(PDO::FETCH_ASSOC) on the RETURNING
 *                     clause (id, uuid, processing_status).
 */
function updateVectorStatus($pdo, $uuid, $vectorStoreId, $fileIds) {
    $updateQuery = "
        UPDATE sprf_court_decisions
        SET
            vector_processed = TRUE,
            vector_processed_at = CURRENT_TIMESTAMP,
            vector_store_id = :vector_store_id,
            vector_file_ids = :vector_file_ids::jsonb,
            processing_status = 'completed',
            updated_at = CURRENT_TIMESTAMP
        WHERE uuid = :uuid
        RETURNING id, uuid, processing_status
    ";

    $statement = $pdo->prepare($updateQuery);

    $bindings = [];
    $bindings[':uuid'] = $uuid;
    $bindings[':vector_store_id'] = $vectorStoreId;
    $bindings[':vector_file_ids'] = json_encode($fileIds);

    $statement->execute($bindings);

    $updatedRow = $statement->fetch(PDO::FETCH_ASSOC);

    return $updatedRow;
}
|
||
|
||
/**
 * Fetch stored files that still need OCR processing.
 *
 * Selects rows with ocr_processed = FALSE whose processing_status is
 * 'pending' or 'processing', ordered by uploaded_at ascending.
 *
 * @param PDO $pdo   Connection used to prepare and run the SELECT.
 * @param int $limit Maximum number of rows to return (default 10).
 * @return array Result of fetchAll(PDO::FETCH_ASSOC); each row has id, uuid,
 *               file_name, s3_url and uploaded_at.
 */
function getFilesForOCR($pdo, $limit = 10) {
    $queueQuery = "
        SELECT
            id,
            uuid,
            file_name,
            s3_url,
            uploaded_at
        FROM sprf_court_decisions
        WHERE
            ocr_processed = FALSE
            AND processing_status IN ('pending', 'processing')
        ORDER BY uploaded_at ASC
        LIMIT :limit
    ";

    $statement = $pdo->prepare($queueQuery);

    // The limit is bound with an explicit integer type rather than passed
    // through execute().
    $statement->bindValue(':limit', $limit, PDO::PARAM_INT);
    $statement->execute();

    $pendingFiles = $statement->fetchAll(PDO::FETCH_ASSOC);

    return $pendingFiles;
}
|
||
|
||
/**
 * Fetch stored files that have been OCR-processed but not yet vectorized.
 *
 * Selects rows with ocr_processed = TRUE, vector_processed = FALSE, a
 * processing_status of 'processing' or 'completed' and a non-null ocr_text,
 * ordered by ocr_processed_at ascending.
 *
 * @param PDO $pdo   Connection used to prepare and run the SELECT.
 * @param int $limit Maximum number of rows to return (default 10).
 * @return array Result of fetchAll(PDO::FETCH_ASSOC); each row has id, uuid,
 *               file_name, ocr_text, ocr_pages_data and s3_url.
 */
function getFilesForVector($pdo, $limit = 10) {
    $queueQuery = "
        SELECT
            id,
            uuid,
            file_name,
            ocr_text,
            ocr_pages_data,
            s3_url
        FROM sprf_court_decisions
        WHERE
            ocr_processed = TRUE
            AND vector_processed = FALSE
            AND processing_status IN ('processing', 'completed')
            AND ocr_text IS NOT NULL
        ORDER BY ocr_processed_at ASC
        LIMIT :limit
    ";

    $statement = $pdo->prepare($queueQuery);

    // The limit is bound with an explicit integer type rather than passed
    // through execute().
    $statement->bindValue(':limit', $limit, PDO::PARAM_INT);
    $statement->execute();

    $readyFiles = $statement->fetchAll(PDO::FETCH_ASSOC);

    return $readyFiles;
}
|
||
|
||
// Пример использования:
|
||
//
|
||
// 1. При получении файла из Telegram:
|
||
/*
|
||
$filePath = '/tmp/downloaded_file.pdf';
|
||
$fileHash = calculateFileHash($filePath);
|
||
|
||
// Проверяем дубликат
|
||
$duplicate = checkDuplicate($pdo, $fileHash);
|
||
if ($duplicate) {
|
||
echo "Файл уже существует: {$duplicate['uuid']}";
|
||
exit;
|
||
}
|
||
|
||
// Загружаем в S3 и получаем URL
|
||
$s3Url = uploadToS3($filePath);
|
||
|
||
// Сохраняем в БД
|
||
$fileData = [
|
||
'file_name' => 'file_0.pdf',
|
||
'file_size' => filesize($filePath),
|
||
'mime_type' => 'application/pdf',
|
||
'file_hash' => $fileHash,
|
||
's3_url' => $s3Url,
|
||
'telegram_message_id' => $messageId,
|
||
'telegram_chat_id' => $chatId,
|
||
'telegram_user_id' => $userId,
|
||
'telegram_username' => $username,
|
||
'telegram_full_name' => $fullName,
|
||
];
|
||
|
||
$result = saveCourtDecisionFile($pdo, $fileData);
|
||
echo "Файл сохранен с UUID: {$result['uuid']}";
|
||
*/
|
||
|
||
// 2. После OCR обработки:
|
||
/*
|
||
$ocrResult = [
|
||
'pages_data' => [
|
||
[
|
||
'page' => 1,
|
||
'ocr_text' => 'Текст страницы 1...',
|
||
'image_url' => '/static/vision_input/file1.png'
|
||
],
|
||
// ...
|
||
]
|
||
];
|
||
|
||
$result = updateOCRStatus($pdo, $uuid, $ocrResult);
|
||
echo "OCR обработан: {$result['ocr_pages_count']} страниц";
|
||
*/
|
||
|
||
// 3. После векторизации:
|
||
/*
|
||
$result = updateVectorStatus(
|
||
$pdo,
|
||
$uuid,
|
||
'vs_abc123xyz',
|
||
['file-id-1', 'file-id-2']
|
||
);
|
||
echo "Векторизация завершена";
|
||
*/
|
||
|
||
?>
|