Files
crm.clientright.ru/jsonjson.js
Fedor 01c4fe80b5 chore: snapshot current working tree changes
Save all currently accumulated repository changes as a backup snapshot for Gitea so no local work is lost.
2026-03-26 14:19:01 +03:00

117 lines
2.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// =====================
// Read the OCR result
// =====================
// JSON payload of the current n8n item.
const ocrResult = $input.item.json;

// =====================
// pages_data (normalized)
// =====================
// Anything that is not an array is treated as "no pages".
const pagesData = Array.isArray(ocrResult.pages_data) ? ocrResult.pages_data : [];

// =====================
// Collect the document text
// =====================
// Sources are tried in this order; the first one that yields non-blank text wins:
//   1. court_acts[0].text from the "Code in JavaScript8" node,
//   2. the ocr_text of every page in pages_data, joined by a blank line,
//   3. the top-level `text` field of the OCR result.
let fullText = '';

try {
  const courtActsText = $('Code in JavaScript8').first().json?.court_acts?.[0]?.text;
  if (courtActsText && String(courtActsText).trim() !== '') {
    fullText = String(courtActsText);
  }
} catch {
  // Deliberately ignored: the referenced node may be unavailable,
  // in which case the remaining sources below are used.
}

if (!fullText) {
  fullText = pagesData
    // String() + optional chaining keep a null page entry or a non-string
    // ocr_text from throwing on .trim().
    .map((page) => String(page?.ocr_text || ''))
    .filter((pageText) => pageText.trim() !== '')
    .join('\n\n');
}

// Fallback: some OCR engines return the final text in the `text` field.
// Checked whenever the pages produced nothing, including the case where
// pages_data is present but every page is empty.
if (!fullText && ocrResult.text && String(ocrResult.text).trim() !== '') {
  fullText = String(ocrResult.text);
}
// =====================
// OCR text cleanup
// =====================
/**
 * Normalizes OCR output into a single-line, single-spaced string.
 *
 * Steps, in order:
 *   1. every symbol listed in the character class below is replaced by a space;
 *   2. a run of two or more of `. , ; :` collapses to its first character;
 *   3. every run of whitespace (newlines included) becomes one space and the
 *      ends are trimmed.
 *
 * @param {*} text - Raw text. Non-string values are converted with String().
 * @returns {string} The cleaned text, or '' for any falsy input.
 */
function cleanOCRText(text) {
  if (!text) return '';

  // Line-level normalization (blank-line collapsing, per-line trimming) is not
  // needed: step 3 rewrites every whitespace run, so it could not change the result.
  return String(text)
    .replace(/[©•»®™✓→←¤…›«""―''◆◇■□●○◎☆★☑☐⚫️×]/g, ' ')
    .replace(/[.,;:]{2,}/g, (run) => run[0])
    .replace(/\s+/g, ' ')
    .trim();
}
// Cleaned version of the collected text.
const cleanedText = cleanOCRText(fullText);

// =====================
// NSFW AGGREGATION (FROM images_data)
// =====================
// Anything that is not an array is treated as "no images".
const imagesData = Array.isArray(ocrResult.images_data) ? ocrResult.images_data : [];

let documentNSFW = false;
let nsfwScoreSum = 0;
let nsfwPagesCount = 0;

for (const img of imagesData) {
  // Only a literal `true` flags an entry; optional chaining lets a null
  // entry be skipped instead of throwing.
  if (img?.nsfw === true) {
    documentNSFW = true;
    nsfwPagesCount++;
  }

  // Number.isFinite rejects NaN (missing or unparsable score) and ±Infinity,
  // either of which would otherwise poison the sum.
  const score = Number(img?.nsfw_score);
  if (Number.isFinite(score)) {
    nsfwScoreSum += score;
  }
}

// First non-zero value wins: the reported page count, then the number of
// images, then the number of pages in pages_data.
const pagesCount =
  Number(ocrResult.pages) ||
  imagesData.length ||
  pagesData.length ||
  0;

// NOTE(review): the sum runs over images_data entries but is divided by
// pagesCount, so the result is a per-page figure and can exceed the largest
// single score when there are more images than pages. Confirm this is intended.
const nsfwScoreAvg = pagesCount > 0 ? nsfwScoreSum / pagesCount : 0;
// =====================
// pages_data → JSON для PostgreSQL
// =====================
const pagesDataJson = JSON.stringify(pagesData);
// =====================
// RETURN
// =====================
return [{
json: {
...ocrResult,
// тексты
original_text: fullText,
cleaned_text: cleanedText,
// страницы
page_count: pagesCount,
pages_data: pagesData,
pages_data_json: pagesDataJson,
// NSFW (ГЛАВНОЕ)
document_nsfw: documentNSFW,
nsfw_pages_count: nsfwPagesCount,
nsfw_score_avg: Number(nsfwScoreAvg.toFixed(6)),
}
}];