Files
crm.clientright.ru/pdf_processor/scripts/process_pdfs.sh
Fedor 01c4fe80b5 chore: snapshot current working tree changes
Save all currently accumulated repository changes as a backup snapshot for Gitea so no local work is lost.
2026-03-26 14:19:01 +03:00

366 lines
14 KiB
Bash

#!/bin/bash
# PDF Processing Script for n8n
# Reads a JSON payload on stdin, downloads/converts the referenced files to
# PDF, merges them per group, and prints a JSON array on stdout (logs -> stderr).
set -e
# Read JSON from stdin (avoids "Argument list too long" for large JSON payloads)
# MODE is passed as the first argument
MODE="${1:-}"
JSON_DATA="$(cat)"
# Per-run scratch directory; the EXIT trap guarantees cleanup on every exit path.
WORK_DIR="$(mktemp -d -t pdfproc.XXXXXX)"
trap 'rm -rf "$WORK_DIR"' EXIT
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >&2; }
# Fetch a URL into a local file using wget or curl, whichever is installed.
# $1 - URL, $2 - destination path.
# Returns non-zero if no downloader exists, the transfer fails, or the
# resulting file is empty.
download() {
local url="$1" filename="$2"
if command -v wget >/dev/null 2>&1; then
wget -q -O "$filename" "$url" || return 1
elif command -v curl >/dev/null 2>&1; then
# -f: fail on HTTP errors (otherwise 404/error pages are saved as the file
# and pass the size check below); -L: follow redirects like wget does.
curl -fsL -o "$filename" "$url" || return 1
else
return 1
fi
# An empty file counts as a failed download.
[[ -s "$filename" ]] || return 1
}
# Base64-encode a file to stdout as one unwrapped line with no trailing
# newline. `tr -d '\n'` replaces the GNU-only `base64 -w 0` (absent on
# BSD/macOS) and also drops the trailing newline that would otherwise be
# embedded into the JSON value via `jq --rawfile`.
to_base64() {
base64 "$1" | tr -d '\n'
}
# Print the page count of a PDF using Ghostscript; print "1" when gs is
# missing or its output is not a plain number. Ghostscript writes error
# text to *stdout*, so without the sanity check below a broken file could
# leak garbage into the downstream unquoted `jq --argjson pages` call.
# NOTE(review): $pdf_file is spliced into the PostScript program; paths
# containing parentheses or backslashes break the count (degrades to "1").
count_pdf_pages() {
local pdf_file="$1" pages=""
if command -v gs >/dev/null 2>&1; then
pages=$(gs -q -dNODISPLAY -c "($pdf_file) (r) file runpdfbegin pdfpagecount = quit" 2>/dev/null || true)
fi
# Accept only a pure decimal number; anything else falls back to "1".
case "$pages" in
''|*[!0-9]*) echo "1" ;;
*) echo "$pages" ;;
esac
}
# Merge one or more PDFs into the first argument.
# Usage: merge_pdfs <output> <input...>
# Zero inputs -> failure; exactly one -> plain copy; several -> Ghostscript
# pdfwrite merge. Fails when the result is missing or empty.
merge_pdfs() {
local output_file="$1"
shift
if [[ $# -eq 0 ]]; then
return 1
fi
if [[ $# -eq 1 ]]; then
cp "$1" "$output_file"
return 0
fi
gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile="$output_file" "$@" 2>/dev/null || return 1
[[ -s "$output_file" ]] || return 1
}
# Convert an arbitrary input file to PDF.
# $1 - input file path, $2 - output PDF path.
# Strategy by (lower-cased) extension:
#   pdf            -> plain copy
#   office formats -> LibreOffice headless conversion
#   anything else  -> ImageMagick: rasterize, rotate landscape images,
#                     fit onto a portrait A4 page (595x842 pt)
# Returns non-zero (with a log line) if the output is missing or empty.
convert_to_pdf(){
local input_file="$1" output_file="$2"
local ext="${input_file##*.}"; ext=$(echo "$ext" | tr '[:upper:]' '[:lower:]')
case "$ext" in
pdf)
if [[ "$input_file" != "$output_file" ]]; then
cp "$input_file" "$output_file"
fi
;;
doc|docx|rtf|odt|xls|xlsx|ods|ppt|pptx|odp)
# LibreOffice writes <basename>.pdf into --outdir; its errors are tolerated
# here and caught by the final -s size check instead.
local tmpdir; tmpdir="$(dirname "$output_file")"
soffice --headless --convert-to pdf --outdir "$tmpdir" "$input_file" >/dev/null 2>&1 || true
local base="$(basename "${input_file%.*}")"
if [[ -f "$tmpdir/$base.pdf" ]]; then
if [[ "$tmpdir/$base.pdf" != "$output_file" ]]; then
mv "$tmpdir/$base.pdf" "$output_file"
fi
else
# Fallback when soffice altered the output name: take the first PDF in
# the out dir. NOTE(review): this can grab an unrelated PDF if several
# conversions share the same directory — confirm calls are serialized.
local produced
produced="$(ls -1 "$tmpdir"/*.pdf 2>/dev/null | head -n1 || true)"
if [[ -n "$produced" && "$produced" != "$output_file" ]]; then mv "$produced" "$output_file"; fi
fi
;;
*)
# Treat everything else as an image: rasterize to PNG at 72 DPI first.
local tmpdir="$(dirname "$output_file")"
local tmpimg="$tmpdir/.img_$$.png"
convert -units PixelsPerInch -density 72 "$input_file" -auto-orient png:"$tmpimg" >/dev/null 2>&1 || true
# Landscape images (width > height) get rotated 90 degrees so they fill
# the portrait A4 page.
local ROTATE=""
if dims=$(identify -format "%w %h" "$tmpimg" 2>/dev/null); then
iw=$(echo "$dims" | awk '{print $1}')
ih=$(echo "$dims" | awk '{print $2}')
if [[ -n "$iw" && -n "$ih" && "$iw" -gt "$ih" ]]; then
ROTATE="-rotate 90"
fi
fi
# $ROTATE is intentionally unquoted: empty expands to nothing, otherwise
# it word-splits into the two arguments "-rotate" "90".
convert \
"$tmpimg" \
$ROTATE \
-background white -alpha remove -alpha off \
-resize 595x842 \
-gravity center -extent 595x842 \
-page 595x842 \
"$output_file" >/dev/null 2>&1 || true
rm -f "$tmpimg"
;;
esac
[[ -s "$output_file" ]] || { log "Empty PDF after convert: $output_file"; return 1; }
}
# Legacy input format:
#   [ { propertyName:{session_token,user_id,contactid},
#       data:[ {url,file_name,field_name}, ... ] } ]
# Downloads each file, converts it to PDF, groups by field_name prefix,
# merges each group and prints a JSON array of result objects.
process_pdf_merge_old(){
local session_token user_id contactid
session_token=$(echo "$JSON_DATA" | jq -r '.[0].propertyName.session_token // ""')
user_id=$(echo "$JSON_DATA" | jq -r '.[0].propertyName.user_id // 0')
contactid=$(echo "$JSON_DATA" | jq -r '.[0].propertyName.contactid // 0')
# group key -> newline-separated list of PDF paths. Newline (not space) as
# the separator keeps file names containing spaces intact.
declare -A file_groups
local file_count; file_count=$(echo "$JSON_DATA" | jq -r '.[0].data | length')
for ((i=0; i<file_count; i++)); do
local url=$(echo "$JSON_DATA" | jq -r ".[0].data[$i].url")
local file_name=$(echo "$JSON_DATA" | jq -r ".[0].data[$i].file_name")
local field_name=$(echo "$JSON_DATA" | jq -r ".[0].data[$i].field_name")
[[ "$url" == "null" || -z "$url" ]] && continue
# Strip the trailing "[N]..." index so repeated fields group together.
local group_key=$(echo "$field_name" | sed 's/\[[0-9]\+\].*$//')
local in_path="$WORK_DIR/$file_name"
download "$url" "$in_path" || continue
local fext="${file_name##*.}"; fext=$(echo "$fext" | tr '[:upper:]' '[:lower:]')
local pdf_path
if [[ "$fext" == "pdf" ]]; then
pdf_path="$in_path"
else
pdf_path="$WORK_DIR/${file_name%.*}.pdf"
convert_to_pdf "$in_path" "$pdf_path" || continue
fi
if [[ -z "${file_groups[$group_key]:-}" ]]; then
file_groups[$group_key]="$pdf_path"
else
file_groups[$group_key]+=$'\n'"$pdf_path"
fi
done
declare -a out; out=()
for group_key in "${!file_groups[@]}"; do
# Space-safe split of the stored path list back into an array.
local files=()
mapfile -t files <<< "${file_groups[$group_key]}"
local gkey="group_${group_key}"
local merged="$WORK_DIR/merged_${gkey}.pdf"
merge_pdfs "$merged" "${files[@]}" || continue
local pages=$(count_pdf_pages "$merged")
# Base64 goes through a temp file to keep huge payloads off the argv.
local tmpb64="$WORK_DIR/merged_${gkey}.b64"
to_base64 "$merged" > "$tmpb64"
local obj=$(jq -n \
--arg group "$gkey" \
--arg session_token "$session_token" \
--argjson user_id "${user_id:-0}" \
--argjson contactid "${contactid:-0}" \
--argjson pages "${pages:-0}" \
--rawfile base64 "$tmpb64" \
'{group:$group, session_token:$session_token, user_id:$user_id, contactid:$contactid, pages:$pages, merged_base64:$base64}')
rm -f "$tmpb64"
out+=("$obj")
done
printf '%s\n' "${out[@]}" | jq -s '.'
}
# Format: a single object { "data": [ { "url", "file_name", "field_name" }, ... ] } — without the [0] wrapper and propertyName
# Flat input format: one object { "data": [ {url,file_name,field_name}, ... ] }
# with no [0] wrapper and no propertyName, so the meta fields default to
# empty/zero. Pipeline is otherwise identical to the legacy handler:
# download -> convert -> group by field_name prefix -> merge -> JSON array.
process_pdf_merge_flat_data(){
local session_token="" user_id=0 contactid=0
# group key -> newline-separated list of PDF paths. Newline (not space) as
# the separator keeps file names containing spaces intact.
declare -A file_groups
local file_count; file_count=$(echo "$JSON_DATA" | jq -r '.data | length')
for ((i=0; i<file_count; i++)); do
local url=$(echo "$JSON_DATA" | jq -r ".data[$i].url")
local file_name=$(echo "$JSON_DATA" | jq -r ".data[$i].file_name")
local field_name=$(echo "$JSON_DATA" | jq -r ".data[$i].field_name")
[[ "$url" == "null" || -z "$url" ]] && continue
# Strip the trailing "[N]..." index so repeated fields group together.
local group_key=$(echo "$field_name" | sed 's/\[[0-9]\+\].*$//')
local in_path="$WORK_DIR/$file_name"
download "$url" "$in_path" || continue
local fext="${file_name##*.}"; fext=$(echo "$fext" | tr '[:upper:]' '[:lower:]')
local pdf_path
if [[ "$fext" == "pdf" ]]; then
pdf_path="$in_path"
else
pdf_path="$WORK_DIR/${file_name%.*}.pdf"
convert_to_pdf "$in_path" "$pdf_path" || continue
fi
if [[ -z "${file_groups[$group_key]:-}" ]]; then
file_groups[$group_key]="$pdf_path"
else
file_groups[$group_key]+=$'\n'"$pdf_path"
fi
done
declare -a out; out=()
for group_key in "${!file_groups[@]}"; do
# Space-safe split of the stored path list back into an array.
local files=()
mapfile -t files <<< "${file_groups[$group_key]}"
local gkey="group_${group_key}"
local merged="$WORK_DIR/merged_${gkey}.pdf"
merge_pdfs "$merged" "${files[@]}" || continue
local pages=$(count_pdf_pages "$merged")
# Base64 goes through a temp file to keep huge payloads off the argv.
local tmpb64="$WORK_DIR/merged_${gkey}.b64"
to_base64 "$merged" > "$tmpb64"
local obj=$(jq -n \
--arg group "$gkey" \
--arg session_token "$session_token" \
--argjson user_id "${user_id:-0}" \
--argjson contactid "${contactid:-0}" \
--argjson pages "${pages:-0}" \
--rawfile base64 "$tmpb64" \
'{group:$group, session_token:$session_token, user_id:$user_id, contactid:$contactid, pages:$pages, merged_base64:$base64}')
rm -f "$tmpb64"
out+=("$obj")
done
printf '%s\n' "${out[@]}" | jq -s '.'
}
# New grouped format:
#   [ { group_index_num, session_token, files_count,
#       files:[ { file:{url,file_name}, description, meta:{...} }, ... ] }, ... ]
# Merges each group's files into one PDF and prints a JSON array.
process_pdf_merge_new(){
local session_token user_id contactid
session_token=$(echo "$JSON_DATA" | jq -r '.[0].files[0].meta.session_token // ""')
user_id=$(echo "$JSON_DATA" | jq -r '.[0].files[0].meta.user_id // 0')
contactid=$(echo "$JSON_DATA" | jq -r '.[0].files[0].meta.contactid // 0')
declare -a out; out=()
local glen; glen=$(echo "$JSON_DATA" | jq -r 'length')
for ((gi=0;gi<glen;gi++)); do
local gkey="group_$(echo "$JSON_DATA" | jq -r ".[$gi].group_index_num // $gi")"
declare -a pdfs; pdfs=()
local flen; flen=$(echo "$JSON_DATA" | jq -r ".[$gi].files | length")
for ((i=0;i<flen;i++)); do
local url=$(echo "$JSON_DATA" | jq -r ".[$gi].files[$i].file.url")
local file_name=$(echo "$JSON_DATA" | jq -r ".[$gi].files[$i].file.file_name")
[[ "$url" == "null" || -z "$url" ]] && continue
local in_path="$WORK_DIR/$file_name"
# "|| continue" matters: under `set -e` an unhandled failure here would
# silently kill the whole script instead of skipping one bad file.
download "$url" "$in_path" || { log "Failed to download $url"; continue; }
local fext="${file_name##*.}"; fext=$(echo "$fext" | tr '[:upper:]' '[:lower:]')
local pdf_path
if [[ "$fext" == "pdf" ]]; then
pdf_path="$in_path"
else
pdf_path="$WORK_DIR/${file_name%.*}.pdf"
convert_to_pdf "$in_path" "$pdf_path" || { log "Failed to convert $in_path to PDF"; continue; }
fi
pdfs+=("$pdf_path")
done
local merged="$WORK_DIR/merged_${gkey}.pdf"
merge_pdfs "$merged" "${pdfs[@]}" || continue
local pages=$(count_pdf_pages "$merged")
# Base64 goes through a temp file to keep huge payloads off the argv.
local tmpb64="$WORK_DIR/merged_${gkey}.b64"
to_base64 "$merged" > "$tmpb64"
# Per-group metadata with defaults when absent.
local group_session_token=$(echo "$JSON_DATA" | jq -r ".[$gi].session_token // \"\"")
local files_count=$(echo "$JSON_DATA" | jq -r ".[$gi].files_count // 0")
local group_index_num_val=$(echo "$JSON_DATA" | jq -r ".[$gi].group_index_num // $gi")
local description=$(echo "$JSON_DATA" | jq -r ".[$gi].files[0].description // \"\"")
local meta_json=$(echo "$JSON_DATA" | jq -c ".[$gi].files[0].meta // {}")
local obj=$(jq -n \
--arg group "$gkey" \
--arg session_token "$session_token" \
--arg group_session_token "$group_session_token" \
--arg description "$description" \
--argjson user_id "${user_id:-0}" \
--argjson contactid "${contactid:-0}" \
--argjson files_count "${files_count:-0}" \
--argjson group_index_num "${group_index_num_val:-0}" \
--argjson pages "${pages:-0}" \
--argjson meta "$meta_json" \
--rawfile base64 "$tmpb64" \
'{group:$group, session_token:$session_token, group_session_token:$group_session_token, user_id:$user_id, contactid:$contactid, files_count:$files_count, group_index_num:$group_index_num, description:$description, meta:$meta, pages:$pages, merged_base64:$base64}')
rm -f "$tmpb64"
out+=("$obj")
done
printf '%s\n' "${out[@]}" | jq -s '.'
}
# Dispatcher: normalizes the incoming JSON and routes it by shape to the
# matching handler. Always prints a JSON array on stdout ('[]' if nothing
# matches). Branch order matters: array -> .data array -> truthy .data ->
# single object with .files (handled inline below).
process_pdf_merge(){
# Normalize JSON: n8n may pass .files as an object {"0":..., "1":...} — coerce it to an array
JSON_DATA=$(echo "$JSON_DATA" | jq '
if type == "array" then
[.[] | if .files then .files |= (if type == "array" then . else [.[] ] end) else . end]
elif .files then
.files |= (if type == "array" then . else [.[] ] end)
else
.
end')
if [[ -z "$JSON_DATA" || "$JSON_DATA" == "null" ]]; then
echo '[]'; return
fi
# Top-level array -> new grouped format.
if echo "$JSON_DATA" | jq -e 'type=="array"' >/dev/null 2>&1; then
process_pdf_merge_new
# Object whose .data is an array -> flat format.
elif echo "$JSON_DATA" | jq -e 'type=="object" and (.data | type)=="array"' >/dev/null 2>&1; then
process_pdf_merge_flat_data
# Any other truthy .data -> legacy format.
elif echo "$JSON_DATA" | jq -e '.data' >/dev/null 2>&1; then
process_pdf_merge_old
# Single group object with .files -> processed inline here.
elif echo "$JSON_DATA" | jq -e '.files' >/dev/null 2>&1; then
local group_data="$JSON_DATA"
local gkey="group_$(echo "$group_data" | jq -r '.group_index_num // 0')"
local session_token=$(echo "$group_data" | jq -r '.files[0].meta.session_token // ""')
local user_id=$(echo "$group_data" | jq -r '.files[0].meta.user_id // 0')
local contactid=$(echo "$group_data" | jq -r '.files[0].meta.contactid // 0')
declare -a pdfs; pdfs=()
local flen; flen=$(echo "$group_data" | jq -r '.files | length')
for ((i=0;i<flen;i++)); do
local url=$(echo "$group_data" | jq -r ".files[$i].file.url")
local file_name=$(echo "$group_data" | jq -r ".files[$i].file.file_name")
[[ "$url" == "null" || -z "$url" ]] && continue
local in_path="$WORK_DIR/$file_name"
# Skip (not abort) on a failed download; set -e would otherwise exit.
download "$url" "$in_path" || { log "Failed to download $url"; continue; }
local fext="${file_name##*.}"; fext=$(echo "$fext" | tr '[:upper:]' '[:lower:]')
local pdf_path
if [[ "$fext" == "pdf" ]]; then
pdf_path="$in_path"
else
pdf_path="$WORK_DIR/${file_name%.*}.pdf"
convert_to_pdf "$in_path" "$pdf_path" || { log "Failed to convert $in_path to PDF"; continue; }
fi
pdfs+=("$pdf_path")
done
if [[ ${#pdfs[@]} -eq 0 ]]; then echo '[]'; return; fi
local merged="$WORK_DIR/merged_${gkey}.pdf"
merge_pdfs "$merged" "${pdfs[@]}" || { echo '[]'; return; }
local pages=$(count_pdf_pages "$merged")
local tmpb64="$WORK_DIR/merged_${gkey}.b64"
to_base64 "$merged" > "$tmpb64"
local group_session_token=$(echo "$group_data" | jq -r '.session_token // ""')
local files_count=$(echo "$group_data" | jq -r '.files_count // 0')
local group_index_num_val=$(echo "$group_data" | jq -r '.group_index_num // 0')
local description=$(echo "$group_data" | jq -r '.files[0].description // ""')
local meta_json=$(echo "$group_data" | jq -c '.files[0].meta // {}')
# Single-element result array printed directly to stdout.
# NOTE(review): the unquoted --argjson values below break this jq call if
# user_id/contactid/files_count/pages ever arrive non-numeric — confirm
# upstream guarantees numeric values.
jq -n \
--arg group "$gkey" \
--arg session_token "$session_token" \
--arg group_session_token "$group_session_token" \
--arg description "$description" \
--argjson user_id ${user_id:-0} \
--argjson contactid ${contactid:-0} \
--argjson files_count ${files_count:-0} \
--argjson group_index_num ${group_index_num_val:-0} \
--argjson pages ${pages:-0} \
--argjson meta "$meta_json" \
--rawfile base64 "$tmpb64" \
'[{group:$group, session_token:$session_token, group_session_token:$group_session_token, user_id:$user_id, contactid:$contactid, files_count:$files_count, group_index_num:$group_index_num, description:$description, meta:$meta, pages:$pages, merged_base64:$base64}]'
rm -f "$tmpb64"
else
echo '[]'
fi
}
# Entry point. MODE is reserved for future dispatch; today both the explicit
# --pdf-merge flag and the default path run the same merge pipeline.
main(){
process_pdf_merge
}
main "$@"