pdf-lambda-crop/cropper.py
2025-10-07 14:17:39 -04:00

232 lines
6.5 KiB
Python

import io
import sys
from dataclasses import dataclass, fields

import pdfplumber
from pypdf import PdfReader, PdfWriter, Transformation
@dataclass
class Setting:
    """Output dimensions shared by the crop modes.

    Subclasses add mode-specific fields; ``event_argument`` can populate any
    declared field (own or inherited) from an event mapping.
    """

    width: float = 288  # 4 * 72
    height: float = 432  # 6 * 72

    def event_argument(self, event, index, field=None):
        """Copy ``event[index]`` onto ``self.<field>``, converted to the field's type.

        Args:
            event: Mapping holding the raw request values.
            index: Key to read from ``event``.
            field: Name of the dataclass field to set; defaults to ``index``.

        Returns:
            None. Nothing is changed when ``index`` is missing from ``event``
            or ``field`` is not a declared dataclass field.

        Raises:
            ValueError, TypeError: If the raw value cannot be converted by
                ``float()`` / ``int()``.
        """
        if field is None:
            field = index

        # fields() also reports inherited fields, so a subclass instance can
        # still populate ``width`` / ``height`` declared on this base class.
        declared = {f.name: f.type for f in fields(self)}
        try:
            raw = event[index]
            p_type = declared[field]
        except KeyError:
            return

        # Annotations are strings under postponed evaluation, classes otherwise.
        if isinstance(p_type, str):
            type_name = p_type
        else:
            type_name = getattr(p_type, '__name__', None)

        # The builtin names are 'bool' / 'int', not 'boolean' / 'integer'.
        if type_name == 'bool':
            setattr(self, field, raw is True or raw == 'true')
        elif type_name == 'float':
            setattr(self, field, float(raw))
        elif type_name == 'int':
            setattr(self, field, int(raw))
@dataclass
class AutoSettings(Setting):
    """Settings for auto-crop mode: the base output size plus a margin."""

    # Padding that auto_crop subtracts from the minimum and adds to the
    # maximum detected extents (0.1 * 72).
    margin: float = 7.2
@dataclass
class ManualSettings(Setting):
    """Settings for manual-crop mode: crop origin, per-axis scale and rotation."""

    # Offsets of the crop window; manual_crop measures start_y down from the
    # top of the media box and start_x from the left.
    start_x: float = 90
    start_y: float = 90

    # Per-axis scale factors; a value of 1 on both axes skips scaling.
    scale_x: float = 1
    scale_y: float = 1

    # Rotation in degrees passed to the page's rotate(); 0 skips rotation.
    rotate: int = 90
# Module-level log of human-readable progress notes; the crop functions append to it.
messages: list[str] = []
def argument_default(event, index, default, cast):
    """Return ``event[index]`` converted per ``cast``, or ``default`` if absent.

    Args:
        event: Mapping holding the raw request values.
        index: Key to read from ``event``.
        default: Value returned (unconverted) when ``index`` is missing.
        cast: ``'float'``, ``'integer'`` or ``'boolean'``; any other value
            returns the raw entry unchanged.

    Returns:
        The converted value, the raw value, or ``default``.

    Raises:
        ValueError, TypeError: If the raw value cannot be converted by
            ``float()`` / ``int()``.
    """
    # Only the lookup is guarded, so a KeyError raised elsewhere is not
    # mistaken for a missing key.
    try:
        val = event[index]
    except KeyError:
        return default

    if cast == 'float':
        return float(val)
    if cast == 'integer':
        return int(val)
    if cast == 'boolean':
        # Accept an already-decoded boolean as well as the string form.
        return val is True or val == 'true'
    return val
def extents_by_rectangle(page) -> tuple[int|None, int|None, int|None, int|None]:
largest_area = None
largest_rect = None
for rect in page.rects:
area = rect["width"] * rect["height"]
if largest_area is None or area > largest_area:
largest_area = area
largest_rect = rect
if largest_rect is None:
return None, None, None, None
return largest_rect["x0"], largest_rect["y0"], largest_rect["x1"], largest_rect["y1"]
def extents_by_image(page) -> tuple[int|None, int|None, int|None, int|None]:
min_x = None
min_y = None
max_x = None
max_y = None
for obj in page.images:
if min_x is None or obj["x0"] < min_x:
min_x = obj["x0"]
if min_y is None or obj["y0"] < min_y:
min_y = obj["y0"]
if max_x is None or obj["x1"] > max_x:
max_x = obj["x1"]
if max_y is None or obj["y1"] > max_y:
max_y = obj["y1"]
return min_x, min_y, max_x, max_y
# def extents_by_all(page) -> tuple[int|None, int|None, int|None, int|None]:
# min_x = None
# min_y = None
# max_x = None
# max_y = None
#
# for obj in page.objects:
# if obj == 'image':
# x0, y0, x1, y1 = extents_by_image(page)
# if x0 is None or y0 is None or x1 is None or y1 is None:
# continue
# if min_x is None or x0 < min_x:
# min_x = x0
# if min_y is None or y0 < min_y:
# min_y = y0
# if max_x is None or x1 > max_x:
# max_x = x1
# if max_y is None or y1 > max_y:
# max_y = y1
#
# return min_x, min_y, max_x, max_y
def auto_crop(content: io.BytesIO, settings: AutoSettings) -> io.BytesIO:
    """Crop the first page of a PDF to its detected content and fit the target size.

    The content extents are taken from the largest rectangle on the page,
    falling back to the bounding box of the page's images. The extents are
    padded by ``settings.margin``; the page is rotated 90 degrees when the
    extents are wider than tall and scaled down uniformly when they exceed
    ``settings.width`` x ``settings.height``. Progress notes are appended to
    the module-level ``messages`` list.

    Args:
        content: The source PDF.
        settings: Target size and margin.

    Returns:
        A new ``io.BytesIO``, rewound to position 0, holding a one-page PDF.

    Raises:
        ValueError: If neither rectangles nor images yield extents.
    """
    messages.append("Using auto-crop mode")

    extents_method = 'rectangle'
    with pdfplumber.open(content) as pdf:
        page = pdf.pages[0]
        extents = extents_by_rectangle(page)
        if any(edge is None for edge in extents):
            extents_method = 'image'
            extents = extents_by_image(page)

    if any(edge is None for edge in extents):
        raise ValueError("Unable to find the extents of the document")
    messages.append(f"Used {extents_method} to find extents")

    min_x, min_y, max_x, max_y = extents
    min_x = round(min_x - settings.margin)
    min_y = round(min_y - settings.margin)
    max_x = round(max_x + settings.margin)
    max_y = round(max_y + settings.margin)

    width = max_x - min_x
    height = max_y - min_y
    messages.append(f"Document extents: {width}x{height}")

    # Compare against the target in its own orientation: a landscape extent
    # is measured as portrait and the page is rotated afterwards.
    rotate = width > height
    if rotate:
        width, height = height, width

    scale = 1
    if width > settings.width or height > settings.height:
        scale = min(settings.width / width, settings.height / height)
        # The content is scaled about the origin below, so the crop window
        # has to move with it; the margin is applied again after scaling.
        min_x = round((min_x * scale) - settings.margin)
        min_y = round((min_y * scale) - settings.margin)
        max_x = round((max_x * scale) + settings.margin)
        max_y = round((max_y * scale) + settings.margin)

    reader = PdfReader(content)
    writer = PdfWriter()
    p1 = reader.pages[0]

    # Put the smaller y on the lower corner and the larger y on the upper
    # corner so the stored crop box is not inverted (negative height).
    p1.cropbox.lower_left = (min_x, min_y)
    p1.cropbox.upper_right = (max_x, max_y)

    if rotate:
        p1.rotate(90)
        messages.append("Rotated 90 degrees")

    if scale != 1:
        op = Transformation().scale(sx=scale, sy=scale)
        p1.add_transformation(op)
        messages.append(f"Scaled by {scale}")

    writer.add_page(p1)

    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer
def manual_crop(content: io.BytesIO, settings: ManualSettings) -> io.BytesIO:
    """Crop the first page of a PDF to an explicitly positioned window.

    The media box is set to a window that starts ``start_x`` from the left
    and ``start_y`` below the top of the original media box and spans
    ``width * 72`` by ``height * 72``. Optional per-axis scaling and rotation
    are then applied. Progress notes are appended to the module-level
    ``messages`` list. ``settings`` is read but never modified.

    Args:
        content: The source PDF.
        settings: Crop origin, size, scale factors and rotation.

    Returns:
        A new ``io.BytesIO``, rewound to position 0, holding a one-page PDF.
    """
    messages.append("Using manual-crop mode")

    # Work on local copies so the caller's settings object is left intact and
    # a second call with the same object behaves the same way.
    start_x, start_y = settings.start_x, settings.start_y
    width, height = settings.width, settings.height
    scale_x, scale_y = settings.scale_x, settings.scale_y

    # Swap directions to be less confusing for inputs. Any odd quarter turn
    # (90, -90, 270, -270) exchanges the axes.
    if settings.rotate % 180 == 90:
        start_x, start_y = start_y, start_x
        width, height = height, width
        scale_x, scale_y = scale_y, scale_x

    reader = PdfReader(content)
    writer = PdfWriter()
    p1 = reader.pages[0]

    # NOTE(review): width/height are multiplied by 72 here, while the defaults
    # on Setting are commented as already being "N * 72" -- confirm the
    # units callers pass for manual mode.
    top = p1.mediabox.top - start_y
    bottom = top - (height * 72)
    left = start_x
    right = left + (width * 72)

    # Two opposite corners fully define the box.
    p1.mediabox.lower_left = (left, bottom)
    p1.mediabox.upper_right = (right, top)

    if scale_x != 1 or scale_y != 1:
        op = Transformation().scale(sx=scale_x, sy=scale_y)
        p1.add_transformation(op)
        messages.append(f"Scaled by {scale_x}x{scale_y}")

    if settings.rotate != 0:
        writer.add_page(p1.rotate(settings.rotate))
        messages.append(f"Rotated {settings.rotate} degrees")
    else:
        writer.add_page(p1)

    buffer = io.BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer