Is there a way to automatically mark up a PDF with the target page number opposi...

Question 1

I often have a situation where I have a PDF which contains pages with text, and some words of the text have links which point to other pages within the same PDF. Is there a way to get the page number pointed to by each link to appear opposite the link (i.e. same Y value as link and an X value almost up to the right hand edge of the page) so that someone using a hardcopy of the PDF (who therefore cannot click on links) can see which page number the link points to so that they can turn to that page?

I have looked at using Adobe Acrobat, PDF X-Change Editor, Nitro PDF, and Foxit but none have a facility which could do this.

Background (optional; not needed to understand the question): the context is legal procedure. At a court hearing there is a collection of documents combined into a single bookmarked PDF. Some of the documents are witness statements. Most of the documents are ordinary letters, emails, invoices, photos etc. from the past which happen to be relevant to what the court case is about. Statements will refer to specific documents, but the documents they refer to won't be positioned immediately following each statement; instead they will be scattered throughout the PDF (because they are required to be in a specific order). To make it easy when reading through statements, every time a statement mentions a document the words referencing that document are hyperlinked to the document. So far so straightforward. But not everyone uses the PDF. Some people just use a printout of the PDF and so can't use the hyperlinks. To cater for them, the PDF page number of each document hyperlinked to in a statement is written opposite the hyperlinked words.

Question 2

Here is a script which accomplishes this using the pdftl API.

To set it up, make a venv and activate it, run pip install 'pdftl[full]', save the script as markup.py and then run python markup.py filename.pdf.

#!/usr/bin/env python3

# See https://superuser.com/a/1938267
# And/or https://gist.github.com/pdftl-dev/d09ecab220919a5532df81a553fcca8e/

import os
import sys
import pikepdf
import pdftl

# Version number of this script; printed at startup when DEBUG is on.
SCRIPT_VERSION = 19

# When True, print a line for every link that is skipped (bad rectangle,
# unusable page box, or no resolvable target page).
SHOW_SKIPPED = True
# When True, also print the script version and the exception details behind
# each skipped link / failed target-page lookup.
DEBUG = True

# When True, markers on the same source page whose vertical positions are close
# together are combined into one marker (see merge_clashing_spec_data).
MERGE_CLASHES = True


def add_page_markers_with_dump(input_pdf_path: str, offset: int, page_specs: list):
    """Write a copy of a PDF with "→ p. N" markers beside its internal links.

    The annotated copy is saved next to the input as ``<name>_printed<ext>``.
    Nothing is written when no link resolves to a target page.

    Args:
        input_pdf_path: Path of the PDF to read.
        offset: Number subtracted from every target page number before it is
            printed in a marker.
        page_specs: Page-range strings restricting which pages are scanned for
            links; an empty list scans all pages.
    """
    base, ext = os.path.splitext(input_pdf_path)
    output_pdf_path = f"{base}_printed{ext}"

    # The context manager closes the input file even if a later step raises.
    with pikepdf.open(input_pdf_path) as pdf:
        print(f"Dumping annotations for {input_pdf_path}...")
        op_args = [f"{spec}/Link" for spec in page_specs] if page_specs else ["/Link"]
        dump_result = pdftl.dump_annots(pdf, operation_args=op_args, full_result=True)
        links = dump_result.data
        print(f"Found {len(links)} potential links.")

        # Pass the offset explicitly rather than relying on a module global, so
        # this function also works when the file is imported as a module.
        resolved = (link_to_text_spec(link, pdf, offset) for link in links)
        spec_data = [spec for spec in resolved if spec is not None]
        print(f"Resolved {len(spec_data)} link page numbers.")

        if MERGE_CLASHES:
            spec_data = merge_clashing_spec_data(spec_data)
            print(f"After merging clashes: {len(spec_data)} items to overlay.")

        text_specs = [
            (
                f"{source_page}!→ p. {target_page_with_offset}"
                f"!(x={target_x}pt, y={target_y}pt, size=8, color=0.4 0.4 0.4, align=right)"
            )
            for (source_page, target_page_with_offset, target_x, target_y) in spec_data
        ]
        if text_specs:
            print(f"Overlaying {len(text_specs)} hardcopy markers...")
            pdftl.pipeline(pdf).add_text(*text_specs).save(output_pdf_path)
            print(f"Success! Saved to: {output_pdf_path}")
        else:
            print("No internal links found to mark. Doing nothing.")


def merge_clashing_spec_data(data, clash_dist=5):
    """Combine markers on the same page that would overlap vertically.

    Args:
        data: Iterable of ``(source_page, target_page, target_x, target_y)``
            records. It is not modified.
        clash_dist: Two consecutive markers on one page are merged when their
            y values differ by at most this amount.

    Returns:
        A new list of records ordered by source page, then by descending y.
        A merged record carries the comma-joined target pages of the records
        it replaces.
    """
    # Group by page before ordering by height: sorting on y alone interleaves
    # pages, which hides clashes between neighbours on the same page.
    ordered = sorted(data, key=lambda record: (record[0], -record[3]))

    merged = []
    last_page = None
    last_y = None
    for record in ordered:
        source_page, target_page, target_x, target_y = record
        if source_page != last_page:
            # First marker seen on this page: nothing to clash with yet.
            last_page = source_page
            last_y = target_y
            merged.append(record)
            continue
        if last_y is not None and target_y - clash_dist <= last_y <= target_y + clash_dist:
            # Because records arrive in descending y order, this places the
            # merged marker at the lower of the two positions.
            last_y = min(target_y, last_y)
            previous = merged.pop()
            joined_pages = f"{previous[1]},{target_page}"
            merged.append((source_page, joined_pages, target_x, (last_y + target_y) / 2))
            continue
        last_y = target_y
        merged.append(record)
    return merged


def link_to_text_spec(link, pdf, offset=0):
    """Work out where to print the marker for one link, and what page it shows.

    Args:
        link: One annotation record from the dump; read via ``link["Properties"]``
            and ``link["Page"]``.
        pdf: The open PDF; ``pdf.pages[source_page - 1].trimbox`` supplies the
            page width.
        offset: Number subtracted from the target page number.

    Returns:
        ``(source_page, target_page - offset, target_x, target_y)``, or ``None``
        when the link has no usable rectangle, page box or target page. Skipped
        links are reported when SHOW_SKIPPED is set.
    """
    # A malformed record should skip that one link, not abort the whole run, so
    # missing keys and short arrays are treated like the other bad-data errors.
    skippable = (AttributeError, ValueError, TypeError, KeyError, IndexError)

    try:
        props = link["Properties"]
        rect = props.get("/Rect")
        # Midpoint of rect entries 1 and 3, lowered by 2; presumably the 2pt
        # nudge aligns the 8pt marker text with the link text - TODO confirm.
        target_y = (float(rect[1]) + float(rect[3])) / 2 - 2
    except skippable as exc:
        if SHOW_SKIPPED:
            print(f"Skipping[badrect] {link}")
            if DEBUG:
                print(f"Exception[badrect]: {type(exc).__name__}: {exc}")
        return None

    try:
        source_page = link["Page"]
        page_box = pdf.pages[source_page - 1].trimbox
        page_width = float(page_box[2]) - float(page_box[0])
    except skippable as exc:
        if SHOW_SKIPPED:
            print(f"Skipping[badwidth] {link}")
            if DEBUG:
                print(f"Exception[badwidth]: {type(exc).__name__}: {exc}")
        return None

    target_page = get_target_page(props)
    if target_page is None:
        if SHOW_SKIPPED:
            print(f"No target page found, skipping {link}")
        return None

    # NOTE(review): this measures from the box width, which equals the right
    # edge only if the trim box starts at x=0 - confirm how pdftl interprets x.
    target_x = page_width - 10  # 10pt margin from the right edge
    return source_page, target_page - offset, target_x, target_y


def get_target_page(props):
    """Return the destination page number recorded for a link, or None.

    Three locations inside ``props`` are tried in order:

    1. ``props["/A"]["ResolvedDestination"]["TargetPage"]``
    2. ``props["/Dest"][0]["Page"]``
    3. ``props["/A"]["/D"][0]["Page"]``

    A location that is missing, has the wrong shape, or holds ``None`` is
    skipped in favour of the next one. When every location fails and DEBUG is
    set, the reason for each failure is printed.

    Args:
        props: The link's properties mapping (anything with a ``get`` method).

    Returns:
        The first page value found, or ``None`` if no location yields one.
    """
    caught_errors = (AttributeError, TypeError, IndexError, KeyError)
    lookups = (
        (
            "1[A.RD.TP]",
            lambda: props.get("/A").get("ResolvedDestination").get("TargetPage"),
        ),
        ("2[Dest0.P]", lambda: props.get("/Dest")[0].get("Page")),
        ("3[A.D0.P]", lambda: props.get("/A").get("/D")[0].get("Page")),
    )

    failures = []
    for label, lookup in lookups:
        try:
            page = lookup()
        except caught_errors as exc:
            failures.append(f"Exception {label}: {type(exc).__name__}: {exc}")
            continue
        if page is not None:
            return page
        # The structure existed but held no page: keep trying the remaining
        # locations instead of giving up on the link.
        failures.append(f"Attempt {label}: no page number present")

    if DEBUG:
        print("\n".join(failures))

    return None


# Command-line entry point: <input.pdf> [<offset>] [<page_ranges>...]
if __name__ == "__main__":
    if DEBUG:
        print(f"SCRIPT VERSION: {SCRIPT_VERSION}")
        # pdftl.cli.help_version.print_version()
    usage = f"Usage: python {sys.argv[0]} <input.pdf> [<offset>] [<page_ranges>...]"
    if len(sys.argv) < 2:
        print(usage, file=sys.stderr)
        sys.exit(1)
    # These stay module-level names (not wrapped in a main() function) so that
    # any code in this file reading `offset` as a global keeps working.
    input_file = sys.argv[1]
    try:
        offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    except ValueError:
        # Report a bad offset as a usage error instead of a traceback.
        print(
            f"Error: <offset> must be an integer, got {sys.argv[2]!r}",
            file=sys.stderr,
        )
        print(usage, file=sys.stderr)
        sys.exit(1)
    page_specs = sys.argv[3:]
    add_page_markers_with_dump(input_file, offset, page_specs)

You can also run python markup.py filename.pdf 5 if you need to offset the page numbers by 5 (for example, if PDF page 6 should be printed as "p. 1" because the physical and printed page numbers are different).

Similarly, python markup.py filename.pdf 0 5-10 70-end will only add markers for links found on pages 5-10 and 70-end; note that the offset (here 0) must be given before any page ranges.

demo: demo output

Disclaimer: I am the pdftl developer