main
  1import json
  2import sys
  3
  4from pypdf import PdfReader, PdfWriter
  5
  6from extract_form_field_info import get_field_info
  7
  8
  9# Fills fillable form fields in a PDF. See forms.md.
 10
 11
 12def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
 13    with open(fields_json_path) as f:
 14        fields = json.load(f)
 15    # Group by page number.
 16    fields_by_page = {}
 17    for field in fields:
 18        if "value" in field:
 19            field_id = field["field_id"]
 20            page = field["page"]
 21            if page not in fields_by_page:
 22                fields_by_page[page] = {}
 23            fields_by_page[page][field_id] = field["value"]
 24    
 25    reader = PdfReader(input_pdf_path)
 26
 27    has_error = False
 28    field_info = get_field_info(reader)
 29    fields_by_ids = {f["field_id"]: f for f in field_info}
 30    for field in fields:
 31        existing_field = fields_by_ids.get(field["field_id"])
 32        if not existing_field:
 33            has_error = True
 34            print(f"ERROR: `{field['field_id']}` is not a valid field ID")
 35        elif field["page"] != existing_field["page"]:
 36            has_error = True
 37            print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})")
 38        else:
 39            if "value" in field:
 40                err = validation_error_for_field_value(existing_field, field["value"])
 41                if err:
 42                    print(err)
 43                    has_error = True
 44    if has_error:
 45        sys.exit(1)
 46
 47    writer = PdfWriter(clone_from=reader)
 48    for page, field_values in fields_by_page.items():
 49        writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
 50
 51    # This seems to be necessary for many PDF viewers to format the form values correctly.
 52    # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
 53    writer.set_need_appearances_writer(True)
 54    
 55    with open(output_pdf_path, "wb") as f:
 56        writer.write(f)
 57
 58
 59def validation_error_for_field_value(field_info, field_value):
 60    field_type = field_info["type"]
 61    field_id = field_info["field_id"]
 62    if field_type == "checkbox":
 63        checked_val = field_info["checked_value"]
 64        unchecked_val = field_info["unchecked_value"]
 65        if field_value != checked_val and field_value != unchecked_val:
 66            return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"'
 67    elif field_type == "radio_group":
 68        option_values = [opt["value"] for opt in field_info["radio_options"]]
 69        if field_value not in option_values:
 70            return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}' 
 71    elif field_type == "choice":
 72        choice_values = [opt["value"] for opt in field_info["choice_options"]]
 73        if field_value not in choice_values:
 74            return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}'
 75    return None
 76
 77
 78# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
 79# In _writer.py around line 966:
 80#
 81# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
 82#     txt = "\n".join(annotation.get_inherited(FA.Opt, []))
 83#
 84# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
 85# [["value1", "Text 1"], ["value2", "Text 2"], ...]
 86# This causes `join` to throw a TypeError because it expects an iterable of strings.
 87# The horrible workaround is to patch `get_inherited` to return a list of the value strings.
 88# We call the original method and adjust the return value only if the argument to `get_inherited`
 89# is `FA.Opt` and if the return value is a list of two-element lists.
 90def monkeypatch_pydpf_method():
 91    from pypdf.generic import DictionaryObject
 92    from pypdf.constants import FieldDictionaryAttributes
 93
 94    original_get_inherited = DictionaryObject.get_inherited
 95
 96    def patched_get_inherited(self, key: str, default = None):
 97        result = original_get_inherited(self, key, default)
 98        if key == FieldDictionaryAttributes.Opt:
 99            if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result):
100                result = [r[0] for r in result]
101        return result
102
103    DictionaryObject.get_inherited = patched_get_inherited
104
105
if __name__ == "__main__":
    # Exactly three positional arguments are required after the script name:
    # the input PDF, the JSON file with field values, and the output PDF.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]")
        sys.exit(1)

    # Apply the pypdf workaround before any form values are written.
    monkeypatch_pydpf_method()

    input_pdf, fields_json, output_pdf = cli_args
    fill_pdf_fields(input_pdf, fields_json, output_pdf)