diff options
Diffstat (limited to 'extract/Makefile')
-rw-r--r-- | extract/Makefile | 232 |
1 files changed, 199 insertions, 33 deletions
diff --git a/extract/Makefile b/extract/Makefile index 086c09b7..31b099c3 100644 --- a/extract/Makefile +++ b/extract/Makefile @@ -18,6 +18,9 @@ # to docx. We require that $(gs) was built with --with-extract-dir=... We # also do a simple test of output-file-per-page. # +# make test-tables +# Tests handling of tables, using mutool with docx device's html output. +# # make test-buffer test-misc test-src # Runs unit tests etc. # @@ -53,13 +56,21 @@ else ifeq ($(build),debug-opt) else ifeq ($(build),memento) flags_link += -g -dl ifeq ($(uname),OpenBSD) - flags_link += -L /usr/local/lib -l execinfo + flags_link += -l execinfo endif flags_compile += -g -D MEMENTO else $(error unrecognised $$(build)=$(build)) endif +gdb = gdb +ifeq ($(uname),OpenBSD) + flags_link += -L /usr/local/lib -l execinfo + $(warning have added -L /usr/local/lib) + gdb = egdb + # For some reason OpenBSD's gmake defaults CXX to g++, which is not helpful. + CXX = c++ +endif # Locations of mutool and gs. By default we assume these are not available. # @@ -72,7 +83,11 @@ endif we_are_mupdf_thirdparty = $(findstring /mupdf/thirdparty/extract, $(abspath .)) ifneq ($(we_are_mupdf_thirdparty),) $(warning we are mupdf thirdparty) - mutool := ../../build/debug/mutool + ifeq ($(build),memento) + mutool := ../../build/memento/mutool + else + mutool := ../../build/debug/mutool + endif gs := ../../../ghostpdl/debug-extract-bin/gs libbacktrace = ../../../libbacktrace/.libs endif @@ -86,6 +101,12 @@ endif $(warning mutool=$(mutool)) endif +ifeq ($(build),memento) + mutool_run := MEMENTO_ABORT_ON_LEAK=1 $(mutool) +else + mutool_run := $(mutool) +endif + ifneq ($(gs),) ifeq ($(wildcard $(gs)),) $(error gs does not exist: $(gs)) @@ -96,7 +117,7 @@ endif # Default target - run all tests. # -test: test-buffer test-misc test-src test-exe test-mutool test-gs +test: test-buffer test-misc test-src test-exe test-mutool test-gs test-html test-tables @echo $@: passed # Define the main test targets. 
@@ -115,7 +136,9 @@ ifneq ($(mutool),) tests_exe := $(tests_exe) $(patsubst %, %.intermediate-mu.xml, $(pdfs_generated)) endif ifneq ($(gs),) - tests_exe := $(tests_exe) $(patsubst %, %.intermediate-gs.xml, $(pdfs_generated)) +# 2022-02-23: don't check intermediate-gs, because gs's txtwrite device doesn't +# work easily with multi-page documents since the change to pdfi. +# tests_exe := $(tests_exe) $(patsubst %, %.intermediate-gs.xml, $(pdfs_generated)) endif tests_exe := \ @@ -134,10 +157,15 @@ ifneq ($(mutool),) $(patsubst %, %.mutool.docx.diff, $(pdfs_generated)) \ $(patsubst %, %.mutool-norotate.docx.diff, $(pdfs_generated)) \ $(patsubst %, %.mutool.odt.diff, $(pdfs_generated)) \ + $(patsubst %, %.mutool.text.diff, $(pdfs_generated)) \ tests_mutool_odt := \ $(patsubst %, %.mutool.odt.diff, $(pdfs_generated)) \ + tests_mutool_text := \ + $(patsubst %, %.mutool.text.diff, $(pdfs_generated)) \ + + tests_html := test/generated/table.pdf.mutool.html.diff endif ifneq ($(gs),) # Targets that test direct conversion with gs. @@ -157,18 +185,21 @@ endif test-exe: $(tests_exe) @echo $@: passed -# Checks output of mutool conversion from .pdf to .docx/.odt. Requires that -# mutool was built with extract as a third-party library. +# Checks output of mutool conversion from .pdf to .docx/.odt. # test-mutool: $(tests_mutool) @echo $@: passed -# Checks output of mutool conversion from .pdf to .odt. Requires that mutool -# was built with extract as a third-party library. +# Checks output of mutool conversion from .pdf to .odt. # test-mutool-odt: $(tests_mutool_odt) @echo $@: passed +# Checks output of mutool conversion from .pdf to .text. +# +test-mutool-text: $(tests_mutool_text) + @echo $@: passed + # Checks output of gs conversion from .pdf to .docx. Requires that gs was built # with extract as a third-party library. 
As of 2021-02-10 this requires, for # example ghostpdl/extract being a link to an extract checkout and configuring @@ -193,7 +224,59 @@ test_gs_fpp: $(gs) ls test/generated/text_graphic_image.pdf.gs.*.docx | wc -l | grep '^ *1$$' ls test/generated/Python2.pdf.gs.*.docx | wc -l | grep '^ *1$$' ls test/generated/zlib.3.pdf.gs.*.docx | wc -l | grep '^ *2$$' - + + +test-html: $(tests_html) + +ifneq ($(mutool),) + test_tables_pdfs = \ + test/agstat.pdf \ + test/background_lines_1.pdf \ + test/background_lines_2.pdf \ + test/column_span_1.pdf \ + test/column_span_2.pdf \ + test/electoral_roll.pdf \ + test/rotated.pdf \ + test/row_span.pdf \ + test/table.pdf \ + test/twotables_1.pdf \ + test/twotables_2.pdf \ + + test_tables_generated = $(patsubst test/%, test/generated/%, $(test_tables_pdfs)) + + test_tables_html = $(patsubst test/%.pdf, test/generated/%.pdf.mutool.html.diff, $(test_tables_pdfs)) + test_tables_docx = $(patsubst test/%.pdf, test/generated/%.pdf.mutool.docx.diff, $(test_tables_pdfs)) + test_tables_odt = $(patsubst test/%.pdf, test/generated/%.pdf.mutool.odt.diff, $(test_tables_pdfs)) + + test_tables = $(test_tables_html) $(test_tables_docx) $(test_tables_odt) +endif + +test-tables-html: $(test_tables_html) +test-tables-docx: $(test_tables_docx) +test-tables-odt: $(test_tables_odt) + +test-tables: $(test_tables) + @echo $@: passed + +test/generated/%.pdf.mutool.html.diff: test/generated/%.pdf.mutool.html test/%.pdf.mutool.html.ref + @echo + @echo == Checking $< + diff -u $^ + +test/generated/%.pdf.mutool.cv.html.diff: test/generated/%.pdf.mutool.cv.html test/%.pdf.mutool.html.ref + @echo + @echo == Checking $< + diff -u $^ + +test/generated/%.pdf.mutool.cv.html: test/%.pdf $(mutool) + $(mutool) convert -O resolution=300 -o $<..png $< + EXTRACT_OPENCV_IMAGE_BASE=$< $(mutool_run) convert -F docx -O html -o $@ $< + +test/generated/%.pdf.mutool.text.diff: test/generated/%.pdf.mutool.text test/%.pdf.mutool.text.ref + @echo + @echo == Checking $< + diff -u $^ + # 
Main executable. # @@ -202,10 +285,12 @@ exe_src = \ src/alloc.c \ src/astring.c \ src/buffer.c \ + src/document.c \ src/docx.c \ src/docx_template.c \ src/extract-exe.c \ src/extract.c \ + src/html.c \ src/join.c \ src/mem.c \ src/odt.c \ @@ -216,6 +301,7 @@ exe_src = \ src/xml.c \ src/zip.c \ + ifeq ($(build),memento) exe_src += src/memento.c ifeq ($(uname),Linux) @@ -223,29 +309,52 @@ ifeq ($(build),memento) flags_link += -L $(libbacktrace) -l backtrace -l dl endif endif -exe_obj = $(patsubst src/%.c, src/build/%.c-$(build).o, $(exe_src)) +exe_obj := $(exe_src) +exe_obj := $(patsubst src/%.c, src/build/%.c-$(build).o, $(exe_obj)) +exe_obj := $(patsubst src/%.cpp, src/build/%.cpp-$(build).o, $(exe_obj)) exe_dep = $(exe_obj:.o=.d) exe: $(exe) $(exe): $(exe_obj) - $(CC) $(flags_link) -o $@ $^ -lz -lm + $(CXX) $(flags_link) -o $@ $^ -lz -lm run_exe = $(exe) ifeq ($(build),memento) ifeq ($(uname),Linux) - run_exe = LD_LIBRARY_PATH=$(libbacktrace) MEMENTO_ABORT_ON_LEAK=1 MEMENTO_HIDE_MULTIPLE_REALLOCS=1 $(exe) + run_exe = MEMENTO_ABORT_ON_LEAK=1 MEMENTO_HIDE_MULTIPLE_REALLOCS=1 LD_LIBRARY_PATH=$(libbacktrace) $(exe) #run_exe = LD_LIBRARY_PATH=../libbacktrace/.libs $(exe) endif ifeq ($(uname),OpenBSD) - run_exe = MEMENTO_ABORT_ON_LEAK=1 $(exe) + run_exe = MEMENTO_ABORT_ON_LEAK=1 MEMENTO_HIDE_MULTIPLE_REALLOCS=1 $(exe) endif endif -ifeq ($(create_ref),yes) -# Special rule for populating .ref directories with current output. Useful to +exe_tables = src/build/extract-tables-$(build).exe +exe-tables: $(exe_tables) +exe-tables-test: $(exe_tables) + $< test/agstat.pdf + +ifeq (0,1) +# Do not commit changes to above line. +# +# Special rules for populating .ref directories with current output. Useful to # initialise reference outputs for new output type. 
# +test/%.docx.dir.ref/: test/generated/%.docx.dir/ + rsync -ai $< $@ test/%.odt.dir.ref/: test/generated/%.odt.dir/ rsync -ai $< $@ +test/%.text.ref: test/generated/%.text + rsync -ai $< $@ + +_update_tables_leafs = $(patsubst test/%, %, $(test_tables_pdfs)) +# Update all table docx reference outputs. +# +_update-docx-tables: + for i in $(_update_tables_leafs); do rsync -ai test/generated/$$i.mutool.docx.dir/ test/$$i.mutool.docx.dir.ref/; done +# Update all table odt reference outputs. +# +_update-odt-tables: + for i in $(_update_tables_leafs); do rsync -ai test/generated/$$i.mutool.odt.dir/ test/$$i.mutool.odt.dir.ref/; done endif # Rules that make the various intermediate targets required by $(tests). @@ -255,7 +364,7 @@ test/generated/%.pdf.intermediate-mu.xml: test/%.pdf $(mutool) @echo @echo == Generating intermediate file for $< with mutool. @mkdir -p test/generated - $(mutool) draw -F xmltext -o $@ $< + $(mutool_run) draw -F xmltext -o $@ $< test/generated/%.pdf.intermediate-gs.xml: test/%.pdf $(gs) @echo @@ -297,6 +406,12 @@ test/generated/%.diff: test/generated/%.dir/ test/%.dir.ref/ @echo @echo == Checking $< diff -ru $^ +#if diff -ruq $^; then true; else echo "@@@ failure... fix with: rsync -ai" $^; false; fi + +test/generated/%.html.diff: test/generated/%.html test/%.html.ref + @echo + @echo == Checking $< + diff -u $^ # This checks that -t src/template.docx gives identical results. # @@ -336,6 +451,14 @@ test/generated/%.extract-template.docx.diff: test/generated/%.extract-template.d @rm -r $@ 2>/dev/null || true cd $< && zip -r ../$(notdir $@) . +# Uses zip to create .odt file by zipping up a directory. Useful to recreate +# .odt from reference directory test/*.odt.dir.ref. +%.odt: % + @echo + @echo == Zipping directory into .odt file. + @rm -r $@ 2>/dev/null || true + cd $< && zip -r ../$(notdir $@) . + # Prettifies each .xml file within .docx.dir/ directory. 
%.docx.dir.pretty: %.docx.dir/ @rm -r $@ $@- 2>/dev/null || true @@ -348,19 +471,19 @@ test/generated/%.pdf.mutool.docx: test/%.pdf $(mutool) @echo @echo == Converting .pdf directly to .docx using mutool. @mkdir -p test/generated - $(mutool) convert -O mediabox-clip=yes -o $@ $< + $(mutool_run) convert -O mediabox-clip=yes -o $@ $< test/generated/%.pdf.mutool-norotate.docx: test/%.pdf $(mutool) @echo @echo == Converting .pdf directly to .docx using mutool. @mkdir -p test/generated - $(mutool) convert -O mediabox-clip=yes,rotation=no -o $@ $< + $(mutool_run) convert -O mediabox-clip=yes,rotation=no -o $@ $< test/generated/%.pdf.mutool-spacing.docx: test/%.pdf $(mutool) @echo @echo == Converting .pdf directly to .docx using mutool. @mkdir -p test/generated - $(mutool) convert -O mediabox-clip=yes,spacing=yes -o $@ $< + $(mutool_run) convert -O mediabox-clip=yes,spacing=yes -o $@ $< # Converts .pdf directly to .docx using gs. test/generated/%.pdf.gs.docx: test/%.pdf $(gs) @@ -374,8 +497,21 @@ test/generated/%.pdf.mutool.odt: test/%.pdf $(mutool) @echo @echo == Converting .pdf directly to .odt using mutool. @mkdir -p test/generated - $(mutool) convert -O mediabox-clip=no -o $@ $< + $(mutool_run) convert -O mediabox-clip=no -o $@ $< +# Converts .pdf directly to .html using mutool +test/generated/%.pdf.mutool.html: test/%.pdf $(mutool) + @echo + @echo == Converting .pdf directly to .html using mutool. + @mkdir -p test/generated + $(mutool_run) convert -F docx -O html -o $@ $< + +# Converts .pdf directly to .text using mutool +test/generated/%.pdf.mutool.text: test/%.pdf $(mutool) + @echo + @echo == Converting .pdf directly to .text using mutool. + @mkdir -p test/generated + $(mutool_run) convert -F docx -O text -o $@ $< # Valgrind test # @@ -386,17 +522,29 @@ valgrind: $(exe) test/generated/Python2.pdf.intermediate-mu.xml # Memento tests. 
# ifeq ($(build),memento) -msqueeze: $(exe) test/generated/Python2.pdf.intermediate-mu.xml - MEMENTO_SQUEEZEAT=1 $(run_exe) --alloc-exp-min 0 -r 1 -s 0 -i test/generated/Python2.pdf.intermediate-mu.xml -o test/generated/msqueeze-out.docx 2>&1 | src/memento.py -q 1 -o msqueeze-raw - @echo $@: passed -mfailat: $(exe) test/generated/Python2.pdf.intermediate-mu.xml - MEMENTO_FAILAT=61463 $(run_exe) --alloc-exp-min 0 -r 1 -s 0 -i test/generated/Python2.pdf.intermediate-mu.xml -o test/generated/msqueeze-out.docx - @echo $@: passed -mutool_memento_extract = ../../build/memento-extract/mutool -msqueeze-mutool: - MEMENTO_SQUEEZEAT=1 $(mutool_memento_extract) convert -o test/generated/text_graphic_image.pdf.mutool.docx test/text_graphic_image.pdf 2>&1 | src/memento.py -q 1 -o msqueeze-raw -msqueeze-mutool2: - MEMENTO_SQUEEZEAT=1 $(mutool_memento_extract) convert -o test/generated/Python2.pdf.mutool.docx test/Python2.pdf 2>&1 | src/memento.py -q 1 -o msqueeze-raw +mutool_memento_extract = ../../build/memento/mutool +memento_failat_gdb := $(gdb) -ex 'b Memento_breakpoint' -ex r -ex c -ex bt --args + +# Memento squeeze with test/text_graphic_image.pdf runs quickly - just 2,100 events taking 20s. +# +# test/Python2.pdf is much slower - 301,900 events, taking around 8h. 
+# +msqueeze-mutool-docx: + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 100 $(mutool_run) convert -o $@.docx test/text_graphic_image.pdf +msqueeze-mutool-docx-failat: + MEMENTO_FAILAT=1960 $(memento_failat_gdb) $(mutool) convert -o $@.docx test/text_graphic_image.pdf +msqueeze-mutool-odt: + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 100 $(mutool_run) convert -o $@.docx test/text_graphic_image.pdf +msqueeze-mutool-odt2: + MEMENTO_SQUEEZEAT=4000 ./src/memento.py -q 100 $(mutool_run) convert -o $@.docx test/Python2.pdf +msqueeze-mutool-table: + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 100 $(mutool_run) convert -F docx -O html -o $@.html test/agstat.pdf +msqueeze-mutool-table-docx: + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 100 $(mutool_run) convert -o $@.docx test/agstat.pdf +msqueeze-mutool-table-odt: + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 100 $(mutool_run) convert -o $@.odt test/agstat.pdf +msqueeze-mutool-table-failat: + MEMENTO_FAILAT=296643 MEMENTO_HIDE_MULTIPLE_REALLOCS=1 $(gdb) -ex 'b Memento_breakpoint' -ex r -ex c -ex bt --args $(mutool_memento_extract) convert -F docx -O html -o $@.html test/agstat.pdf endif @@ -437,6 +585,10 @@ test-buffer-valgrind: $(exe_buffer_test) valgrind --leak-check=full ./$< @echo $@: passed +ifeq ($(build),memento) +test-buffer-msqueeze: $(exe_buffer_test) + MEMENTO_SQUEEZEAT=1 ./src/memento.py -q 1 ./$< +endif # Misc unit test. # @@ -477,12 +629,26 @@ test-src: if egrep -wn 'for *[(] *[a-zA-Z0-9]+ [a-zA-Z0-9]' src/*.c src/*.h; then false; else true; fi @echo $@: passed +# Check that all defined global symbols start with 'extract_'. This is not +# included in the overall 'test' target because the use of '!egrep ...' appears +# to break on some cluster machines. +# +test-obj: + @echo + nm -egPC $(exe_obj) | egrep '^[a-zA-Z0-9_]+ T' | grep -vw ^main | ! egrep -v ^extract_ + @echo $@: passed + # Compile rule. We always include src/docx_template.c as a prerequisite in case -# code #includes docx_template.h. +# code #includes docx_template.h. 
We use -std=gnu90 to catch 'ISO C90 forbids +# mixing declarations and code' errors while still supporting 'inline'. # src/build/%.c-$(build).o: src/%.c src/docx_template.c src/odt_template.c @mkdir -p src/build - $(CC) -c $(flags_compile) -o $@ $< + $(CC) -std=gnu90 -c $(flags_compile) -o $@ $< + +src/build/%.cpp-$(build).o: src/%.cpp + @mkdir -p src/build + $(CXX) -c -Wall -W -I /usr/local/include/opencv4 -o $@ $< # Rule for machine-generated source code, src/docx_template.c. Also generates # src/docx_template.h. |