diff options
Diffstat (limited to 'extract/src')
-rw-r--r-- | extract/src/astring.c | 127 | ||||
-rw-r--r-- | extract/src/astring.h | 28 | ||||
-rw-r--r-- | extract/src/buffer-test.c | 2 | ||||
-rw-r--r-- | extract/src/buffer.c | 2 | ||||
-rw-r--r-- | extract/src/document.c | 88 | ||||
-rw-r--r-- | extract/src/document.h | 137 | ||||
-rw-r--r-- | extract/src/docx.c | 598 | ||||
-rw-r--r-- | extract/src/docx.h | 4 | ||||
-rwxr-xr-x | extract/src/docx_template_build.py | 30 | ||||
-rw-r--r-- | extract/src/extract-exe.c | 3 | ||||
-rw-r--r-- | extract/src/extract.c | 891 | ||||
-rw-r--r-- | extract/src/html.c | 314 | ||||
-rw-r--r-- | extract/src/html.h | 23 | ||||
-rw-r--r-- | extract/src/join.c | 1241 | ||||
-rw-r--r-- | extract/src/mem.c | 22 | ||||
-rw-r--r-- | extract/src/mem.h | 13 | ||||
-rwxr-xr-x | extract/src/memento.py | 39 | ||||
-rw-r--r-- | extract/src/misc-test.c | 58 | ||||
-rw-r--r-- | extract/src/odt.c | 627 | ||||
-rw-r--r-- | extract/src/outf.c | 10 | ||||
-rw-r--r-- | extract/src/outf.h | 40 | ||||
-rw-r--r-- | extract/src/sys.c | 2 | ||||
-rw-r--r-- | extract/src/text.c | 22 | ||||
-rw-r--r-- | extract/src/xml.c | 16 | ||||
-rw-r--r-- | extract/src/xml.h | 6 | ||||
-rw-r--r-- | extract/src/zip.c | 43 |
26 files changed, 3533 insertions, 853 deletions
diff --git a/extract/src/astring.c b/extract/src/astring.c index fd09d639..e5d40217 100644 --- a/extract/src/astring.c +++ b/extract/src/astring.c @@ -27,6 +27,9 @@ void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string) int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len) { if (extract_realloc2(alloc, &string->chars, string->chars_num+1, string->chars_num + s_len + 1)) return -1; + /* Coverity doesn't seem to realise that extract_realloc2() modifies + string->chars. */ + /* coverity[deref_parm_field_in_call] */ memcpy(string->chars + string->chars_num, s, s_len); string->chars[string->chars_num + s_len] = 0; string->chars_num += s_len; @@ -65,7 +68,7 @@ int extract_astring_truncate(extract_astring_t* content, int len) return 0; } -int astring_char_truncate_if(extract_astring_t* content, char c) +int extract_astring_char_truncate_if(extract_astring_t* content, char c) { if (content->chars_num && content->chars[content->chars_num-1] == c) { extract_astring_truncate(content, 1); @@ -73,40 +76,58 @@ int astring_char_truncate_if(extract_astring_t* content, char c) return 0; } -int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c) +int extract_astring_catc_unicode( + extract_alloc_t* alloc, + extract_astring_t* string, + int c, + int xml, + int ascii_ligatures, + int ascii_dash, + int ascii_apostrophe + ) { int ret = -1; if (0) {} /* Escape XML special characters. 
*/ - else if (c == '<') extract_astring_cat(alloc, string, "&lt;"); - else if (c == '>') extract_astring_cat(alloc, string, "&gt;"); - else if (c == '&') extract_astring_cat(alloc, string, "&amp;"); - else if (c == '"') extract_astring_cat(alloc, string, "&quot;"); - else if (c == '\'') extract_astring_cat(alloc, string, "&apos;"); + else if (xml && c == '<') extract_astring_cat(alloc, string, "&lt;"); + else if (xml && c == '>') extract_astring_cat(alloc, string, "&gt;"); + else if (xml && c == '&') extract_astring_cat(alloc, string, "&amp;"); + else if (xml && c == '"') extract_astring_cat(alloc, string, "&quot;"); + else if (xml && c == '\'') extract_astring_cat(alloc, string, "&apos;"); /* Expand ligatures. */ - else if (c == 0xFB00) + else if (ascii_ligatures && c == 0xFB00) { if (extract_astring_cat(alloc, string, "ff")) goto end; } - else if (c == 0xFB01) + else if (ascii_ligatures && c == 0xFB01) { if (extract_astring_cat(alloc, string, "fi")) goto end; } - else if (c == 0xFB02) + else if (ascii_ligatures && c == 0xFB02) { if (extract_astring_cat(alloc, string, "fl")) goto end; } - else if (c == 0xFB03) + else if (ascii_ligatures && c == 0xFB03) { if (extract_astring_cat(alloc, string, "ffi")) goto end; } - else if (c == 0xFB04) + else if (ascii_ligatures && c == 0xFB04) { if (extract_astring_cat(alloc, string, "ffl")) goto end; } + + /* Convert some special characters to ascii. */ + else if (ascii_dash && c == 0x2212) + { + if (extract_astring_catc(alloc, string, '-')) goto end; + } + else if (ascii_apostrophe && c == 0x2019) + { + if (extract_astring_catc(alloc, string, '\'')) goto end; + } /* Output ASCII verbatim. */ else if (c >= 32 && c <= 127) @@ -117,18 +138,65 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, /* Escape all other characters. */ else { - char buffer[32]; - if (c < 32 - && (c != 0x9 && c != 0xa && c != 0xd) - ) + if (xml) { - /* Illegal xml character; see - https://www.w3.org/TR/xml/#charsets. 
We replace with - 0xfffd, the unicode replacement character. */ - c = 0xfffd; + char buffer[32]; + if (c < 32 + && (c != 0x9 && c != 0xa && c != 0xd) + ) + { + /* Illegal xml character; see + https://www.w3.org/TR/xml/#charsets. We replace with + 0xfffd, the unicode replacement character. */ + c = 0xfffd; + } + snprintf(buffer, sizeof(buffer), "&#x%x;", c); + if (extract_astring_cat(alloc, string, buffer)) goto end; + } + else + { + /* Use utf8. */ + if (c < 0x80) + { + if (extract_astring_catc(alloc, string, (char) c)) return -1; + } + else if (c < 0x0800) + { + char cc[2] = + { + (char) (((c >> 6) & 0x1f) | 0xc0), + (char) (((c >> 0) & 0x3f) | 0x80) + }; + if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; + } + else if (c < 0x10000) + { + char cc[3] = + { + (char) (((c >> 12) & 0x0f) | 0xe0), + (char) (((c >> 6) & 0x3f) | 0x80), + (char) (((c >> 0) & 0x3f) | 0x80) + }; + if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; + } + else if (c < 0x110000) + { + char cc[4] = + { + (char) (((c >> 18) & 0x07) | 0xf0), + (char) (((c >> 12) & 0x3f) | 0x80), + (char) (((c >> 6) & 0x3f) | 0x80), + (char) (((c >> 0) & 0x3f) | 0x80) + }; + if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; + } + else + { + /* Use replacement character. */ + char cc[4] = { (char) 0xef, (char) 0xbf, (char) 0xbd, 0}; + if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; + } } - snprintf(buffer, sizeof(buffer), "&#x%x;", c); - if (extract_astring_cat(alloc, string, buffer)) goto end; } ret = 0; @@ -136,3 +204,18 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, end: return ret; } + +int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c) +{ + /* Fixme, better to use ascii_ligatures=0, but that requires updates to + expected output files. 
*/ + return extract_astring_catc_unicode( + alloc, + string, + c, + 1 /*xml*/, + 1 /*ascii_ligatures*/, + 0 /*ascii_dash*/, + 0 /*ascii_apostrophe*/ + ); +} diff --git a/extract/src/astring.h b/extract/src/astring.h index c2b60d25..aef4d87f 100644 --- a/extract/src/astring.h +++ b/extract/src/astring.h @@ -11,8 +11,11 @@ typedef struct } extract_astring_t; void extract_astring_init(extract_astring_t* string); +/* Initialises <string> so it is ready for use. */ void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string); +/* Frees any existing data and returns with <string> ready for use as if by +extract_astring_init(). */ int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len); @@ -24,10 +27,33 @@ int extract_astring_catf(extract_alloc_t* alloc, extract_astring_t* string, cons int extract_astring_truncate(extract_astring_t* content, int len); /* Removes last <len> chars. */ -int astring_char_truncate_if(extract_astring_t* content, char c); +int extract_astring_char_truncate_if(extract_astring_t* content, char c); /* Removes last char if it is <c>. */ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c); /* Appends specified character using XML escapes as necessary. */ +int extract_astring_catc_unicode( + extract_alloc_t* alloc, + extract_astring_t* string, + int c, + int xml, + int ascii_ligatures, + int ascii_dash, + int ascii_apostrophe + ); +/* Appends unicode character <c> to <string>. + xml: + If true, we use XML escape sequences for special characters such as '<' + and unicode values above 127. Otherwise we encode as utf8. + ascii_ligatures: if true we expand ligatures to "fl", "fi" etc. + ascii_dash: + If true we replace unicode dash characters with '-'. + ascii_apostrophe: + If true we replace unicode apostrophe with ascii single-quote "'". 
+*/ + +int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c); +/* Appends specific unicode character, using XML escape sequences as required. */ + #endif diff --git a/extract/src/buffer-test.c b/extract/src/buffer-test.c index 6701fbab..a8464c2a 100644 --- a/extract/src/buffer-test.c +++ b/extract/src/buffer-test.c @@ -298,7 +298,7 @@ static void test_file(void) int main(void) { - outf_verbose_set(1); + extract_outf_verbose_set(1); test_read(); test_write(); test_file(); diff --git a/extract/src/buffer.c b/extract/src/buffer.c index 3fd35bfd..b25dee73 100644 --- a/extract/src/buffer.c +++ b/extract/src/buffer.c @@ -375,7 +375,7 @@ int extract_buffer_write_internal( not recoverable. <pos> will be the number of bytes in source..+numbytes that have been successfully flushed, and could be negative if we failed to flush earlier data. */ - outf("failed to flush. actual=%i delta=%i\n", actual, delta); + outf("failed to flush. actual=%li delta=%li\n", (long) actual, (long) delta); e = 0; goto end; } diff --git a/extract/src/document.c b/extract/src/document.c new file mode 100644 index 00000000..d501f259 --- /dev/null +++ b/extract/src/document.c @@ -0,0 +1,88 @@ +#include "document.h" +#include "outf.h" + + +void extract_span_init(span_t* span) +{ + span->font_name = NULL; + span->chars = NULL; + span->chars_num = 0; +} + +void extract_span_free(extract_alloc_t* alloc, span_t** pspan) +{ + if (!*pspan) return; + extract_free(alloc, &(*pspan)->font_name); + extract_free(alloc, &(*pspan)->chars); + extract_free(alloc, pspan); +} + +void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num) +{ + span_t** spans = *pspans; + int s; + for (s=0; s<spans_num; ++s) + { + extract_span_free(alloc, &spans[s]); + } + extract_free(alloc, pspans); +} + +void extract_line_free(extract_alloc_t* alloc, line_t** pline) +{ + line_t* line = *pline; + int s; + for (s=0; s<line->spans_num; ++s) + { + extract_span_free(alloc, 
&line->spans[s]); + } + extract_free(alloc, &line->spans); + extract_free(alloc, pline); +} + +void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num) +{ + int l; + line_t** lines = *plines; + for (l=0; l<lines_num; ++l) + { + extract_line_free(alloc, &lines[l]); + } + extract_free(alloc, plines); +} + +void extract_image_clear(extract_alloc_t* alloc, image_t* image) +{ + extract_free(alloc, &image->type); + extract_free(alloc, &image->name); + extract_free(alloc, &image->id); + if (image->data_free) { + image->data_free(image->data_free_handle, image->data); + } +} + +void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell) +{ + int p; + cell_t* cell = *pcell; + if (!cell) return; + + outf("cell->lines_num=%i", cell->lines_num); + outf("cell->paragraphs_num=%i", cell->paragraphs_num); + extract_lines_free(alloc, &cell->lines, cell->lines_num); + + outf("cell=%p cell->paragraphs_num=%i", cell, cell->paragraphs_num); + for (p=0; p<cell->paragraphs_num; ++p) + { + paragraph_t* paragraph = cell->paragraphs[p]; + outf("paragraph->lines_num=%i", paragraph->lines_num); + /* We don't attempt to free paragraph->lines[] because they point into + cell->lines which are already freed. 
*/ + extract_free(alloc, &paragraph->lines); + extract_free(alloc, &cell->paragraphs[p]); + } + extract_free(alloc, &cell->paragraphs); + extract_free(alloc, pcell); +} + + diff --git a/extract/src/document.h b/extract/src/document.h index c59348f4..2dc4f1ee 100644 --- a/extract/src/document.h +++ b/extract/src/document.h @@ -1,6 +1,15 @@ #ifndef ARTIFEX_EXTRACT_DOCUMENT_H #define ARTIFEX_EXTRACT_DOCUMENT_H +#include "../include/extract.h" + +#ifdef _MSC_VER + #include "compat_stdint.h" +#else + #include <stdint.h> +#endif + + static const double pi = 3.141592653589793; typedef struct @@ -9,6 +18,16 @@ typedef struct double y; } point_t; +const char* extract_point_string(const point_t* point); + +typedef struct +{ + point_t min; + point_t max; +} rect_t; + +const char* extract_rect_string(const rect_t* rect); + typedef struct { double a; @@ -19,9 +38,15 @@ typedef struct double f; } matrix_t; -double matrix_expansion(matrix_t m); +const char* extract_matrix_string(const matrix_t* matrix); -int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) +double extract_matrix_expansion(matrix_t m); +/* Returns a*d - b*c. */ + +point_t extract_multiply_matrix_point(matrix_t m, point_t p); +matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2); + +int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) ; /* Returns zero if first four members of *lhs and *rhs are equal, otherwise +/-1. */ @@ -48,7 +73,7 @@ typedef struct matrix_t trm; char* font_name; - /* font size is matrix_expansion(trm). */ + /* font size is extract_matrix_cmp4(trm). */ struct { unsigned font_bold : 1; @@ -61,14 +86,21 @@ typedef struct } span_t; /* List of chars that have same font and are usually adjacent. */ -char_t* span_char_last(span_t* span); +void extract_span_init(span_t* span); + +void extract_span_free(extract_alloc_t* alloc, span_t** pspan); +/* Frees a span_t, returning with *pspan set to NULL. 
*/ + +void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num); + +char_t* extract_span_char_last(span_t* span); /* Returns last character in span. */ -int span_append_c(extract_alloc_t* alloc, span_t* span, int c); +int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c); /* Appends new char_t to an span_t with .ucs=c and all other fields zeroed. */ -const char* span_string(extract_alloc_t* alloc, span_t* span); +const char* extract_span_string(extract_alloc_t* alloc, span_t* span); /* Returns static string containing info about span_t. */ typedef struct @@ -78,10 +110,13 @@ typedef struct } line_t; /* List of spans that are aligned on same line. */ -span_t* line_span_first(line_t* line); +void extract_line_free(extract_alloc_t* alloc, line_t** pline); +void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num); + +span_t* extract_line_span_first(line_t* line); /* Returns first span in a line. */ -span_t* line_span_last(line_t* line); +span_t* extract_line_span_last(line_t* line); /* Returns last span in a line. */ typedef struct @@ -112,6 +147,61 @@ typedef struct <name> and <id> are created to be unique identifiers for use in generated docx file. */ +void extract_image_clear(extract_alloc_t* alloc, image_t* image); + +typedef struct +{ + float color; + rect_t rect; +} tableline_t; +/* A line that is part of a table. */ + +typedef struct +{ + tableline_t* tablelines; + int tablelines_num; +} tablelines_t; + + +typedef struct +{ + rect_t rect; + + /* If left/above is true, this cell is not obscured by cell to its + left/above. */ + uint8_t left; + uint8_t above; + + /* extend_right and extend_down are 1 for normal cells, 2 for cells which + extend right/down to cover an additional column/row, 3 to cover two + additional columns/rows etc. */ + int extend_right; + int extend_down; + + /* Contents of this cell. 
*/ + line_t** lines; + int lines_num; + paragraph_t** paragraphs; + int paragraphs_num; +} cell_t; +/* A cell within a table. */ + +void extract_cell_init(cell_t* cell); +void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell); + +typedef struct +{ + point_t pos; /* top-left. */ + + /* Array of cells_num_x*cells_num_y cells; cell (x, y) is: + cells_num_x * y + x. + */ + cell_t** cells; + int cells_num_x; + int cells_num_y; +} table_t; + + typedef struct { span_t** spans; @@ -129,10 +219,17 @@ typedef struct int paragraphs_num; /* These refer to items in .lines. Initially empty, then set by extract_join(). */ + + tablelines_t tablelines_horizontal; + tablelines_t tablelines_vertical; + + table_t** tables; + int tables_num; } extract_page_t; /* A page. Contains different representations of the list of spans. NB not -called page_t because this clashes with a system type on hpux. */ ++called page_t because this clashes with a system type on hpux. */ + typedef struct { @@ -150,9 +247,31 @@ typedef struct int imagetypes_num; } images_t; + int extract_document_join(extract_alloc_t* alloc, document_t* document); +/* This does all the work of finding paragraphs and tables. */ double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm); +/* Things below here are used when generating output. */ + +typedef struct +{ + char* name; + double size; + int bold; + int italic; +} font_t; +/* Basic information about current font. */ + +typedef struct +{ + font_t font; + matrix_t* ctm_prev; +} content_state_t; +/* Used to keep track of font information when writing paragraphs of odt +content, e.g. so we know whether a font has changed so need to start a new odt +span. */ + #endif diff --git a/extract/src/docx.c b/extract/src/docx.c index 4532cd4e..761de176 100644 --- a/extract/src/docx.c +++ b/extract/src/docx.c @@ -21,6 +21,7 @@ docx_paragraph_finish(). 
*/ #include <assert.h> #include <errno.h> +#include <float.h> #include <math.h> #include <stdlib.h> #include <stdio.h> @@ -29,46 +30,42 @@ docx_paragraph_finish(). */ #include <sys/stat.h> -static int extract_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) +static int s_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "\n\n<w:p>"); } -static int extract_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) +static int s_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "\n</w:p>"); } -static int extract_docx_run_start( +static int s_docx_run_start( extract_alloc_t* alloc, extract_astring_t* content, - const char* font_name, - double font_size, - int bold, - int italic + content_state_t* content_state ) -/* Starts a new run. Caller must ensure that extract_docx_run_finish() was +/* Starts a new run. Caller must ensure that s_docx_run_finish() was called to terminate any previous run. 
*/ { int e = 0; if (!e) e = extract_astring_cat(alloc, content, "\n<w:r><w:rPr><w:rFonts w:ascii=\""); - if (!e) e = extract_astring_cat(alloc, content, font_name); + if (!e) e = extract_astring_cat(alloc, content, content_state->font.name); if (!e) e = extract_astring_cat(alloc, content, "\" w:hAnsi=\""); - if (!e) e = extract_astring_cat(alloc, content, font_name); + if (!e) e = extract_astring_cat(alloc, content, content_state->font.name); if (!e) e = extract_astring_cat(alloc, content, "\"/>"); - if (!e && bold) e = extract_astring_cat(alloc, content, "<w:b/>"); - if (!e && italic) e = extract_astring_cat(alloc, content, "<w:i/>"); + if (!e && content_state->font.bold) e = extract_astring_cat(alloc, content, "<w:b/>"); + if (!e && content_state->font.italic) e = extract_astring_cat(alloc, content, "<w:i/>"); { char font_size_text[32]; - if (0) font_size = 10; if (!e) e = extract_astring_cat(alloc, content, "<w:sz w:val=\""); - snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 2); + snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2); extract_astring_cat(alloc, content, font_size_text); extract_astring_cat(alloc, content, "\"/>"); if (!e) e = extract_astring_cat(alloc, content, "<w:szCs w:val=\""); - snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 1.5); + snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 1.5); extract_astring_cat(alloc, content, font_size_text); extract_astring_cat(alloc, content, "\"/>"); } @@ -77,38 +74,39 @@ called to terminate any previous run. 
*/ } -static int extract_docx_run_finish(extract_alloc_t* alloc, extract_astring_t* content) +static int s_docx_run_finish(extract_alloc_t* alloc, content_state_t* state, extract_astring_t* content) { + if (state) state->font.name = NULL; return extract_astring_cat(alloc, content, "</w:t></w:r>"); } -static int extract_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content) +static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content) /* Append an empty paragraph to *content. */ { int e = -1; - if (extract_docx_paragraph_start(alloc, content)) goto end; + static char fontname[] = "OpenSans"; + content_state_t content_state = {0}; + if (s_docx_paragraph_start(alloc, content)) goto end; /* It seems like our choice of font size here doesn't make any difference to the ammount of vertical space, unless we include a non-space character. Presumably something to do with the styles in the template document. */ - if (extract_docx_run_start( - alloc, - content, - "OpenSans", - 10 /*font_size*/, - 0 /*font_bold*/, - 0 /*font_italic*/ - )) goto end; + content_state.font.name = fontname; + content_state.font.size = 10; + content_state.font.bold = 0; + content_state.font.italic = 0; + + if (s_docx_run_start(alloc, content, &content_state)) goto end; //docx_char_append_string(content, " "); /*   is non-break space. */ - if (extract_docx_run_finish(alloc, content)) goto end; - if (extract_docx_paragraph_finish(alloc, content)) goto end; + if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end; + if (s_docx_paragraph_finish(alloc, content)) goto end; e = 0; end: return e; } -static int extract_docx_char_truncate_if(extract_astring_t* content, char c) +static int s_docx_char_truncate_if(extract_astring_t* content, char c) /* Removes last char if it is <c>. 
*/ { if (content->chars_num && content->chars[content->chars_num-1] == c) { @@ -118,22 +116,9 @@ static int extract_docx_char_truncate_if(extract_astring_t* content, char c) } -typedef struct -{ - const char* font_name; - double font_size; - int font_bold; - int font_italic; - matrix_t* ctm_prev; -} content_state_t; -/* Used to keep track of font information when writing paragraphs of docx -content, e.g. so we know whether a font has changed so need to start a new docx -span. */ - - -static int extract_document_to_docx_content_paragraph( +static int s_document_to_docx_content_paragraph( extract_alloc_t* alloc, - content_state_t* state, + content_state_t* content_state, paragraph_t* paragraph, extract_astring_t* content ) @@ -142,7 +127,7 @@ font. */ { int e = -1; int l; - if (extract_docx_paragraph_start(alloc, content)) goto end; + if (s_docx_paragraph_start(alloc, content)) goto end; for (l=0; l<paragraph->lines_num; ++l) { line_t* line = paragraph->lines[l]; @@ -151,45 +136,38 @@ font. 
*/ int si; span_t* span = line->spans[s]; double font_size_new; - state->ctm_prev = &span->ctm; + content_state->ctm_prev = &span->ctm; font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm); - if (!state->font_name - || strcmp(span->font_name, state->font_name) - || span->flags.font_bold != state->font_bold - || span->flags.font_italic != state->font_italic - || font_size_new != state->font_size + if (!content_state->font.name + || strcmp(span->font_name, content_state->font.name) + || span->flags.font_bold != content_state->font.bold + || span->flags.font_italic != content_state->font.italic + || font_size_new != content_state->font.size ) { - if (state->font_name) { - if (extract_docx_run_finish(alloc, content)) goto end; + if (content_state->font.name) { + if (s_docx_run_finish(alloc, content_state, content)) goto end; } - state->font_name = span->font_name; - state->font_bold = span->flags.font_bold; - state->font_italic = span->flags.font_italic; - state->font_size = font_size_new; - if (extract_docx_run_start( - alloc, - content, - state->font_name, - state->font_size, - state->font_bold, - state->font_italic - )) goto end; + content_state->font.name = span->font_name; + content_state->font.bold = span->flags.font_bold; + content_state->font.italic = span->flags.font_italic; + content_state->font.size = font_size_new; + if (s_docx_run_start(alloc, content, content_state)) goto end; } for (si=0; si<span->chars_num; ++si) { char_t* char_ = &span->chars[si]; int c = char_->ucs; - if (extract_astring_cat_xmlc(alloc, content, c)) goto end; + if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end; } /* Remove any trailing '-' at end of line. 
*/ - if (extract_docx_char_truncate_if(content, '-')) goto end; + if (s_docx_char_truncate_if(content, '-')) goto end; } } - if (state->font_name) { - if (extract_docx_run_finish(alloc, content)) goto end; - state->font_name = NULL; + if (content_state->font.name) + { + if (s_docx_run_finish(alloc, content_state, content)) goto end; } - if (extract_docx_paragraph_finish(alloc, content)) goto end; + if (s_docx_paragraph_finish(alloc, content)) goto end; e = 0; @@ -197,7 +175,7 @@ font. */ return e; } -static int extract_document_append_image( +static int s_docx_append_image( extract_alloc_t* alloc, extract_astring_t* content, image_t* image @@ -265,7 +243,7 @@ static int extract_document_append_image( } -static int extract_document_output_rotated_paragraphs( +static int s_docx_output_rotated_paragraphs( extract_alloc_t* alloc, extract_page_t* page, int paragraph_begin, @@ -353,7 +331,7 @@ static int extract_document_output_rotated_paragraphs( /* Output paragraphs p0..p2-1. */ for (p=paragraph_begin; p<paragraph_end; ++p) { paragraph_t* paragraph = page->paragraphs[p]; - if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; + if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } extract_astring_cat(alloc, content, "\n"); @@ -387,7 +365,7 @@ static int extract_document_output_rotated_paragraphs( for (p=paragraph_begin; p<paragraph_end; ++p) { paragraph_t* paragraph = page->paragraphs[p]; - if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; + if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end; } extract_astring_cat(alloc, content, "\n"); @@ -406,6 +384,257 @@ static int extract_document_output_rotated_paragraphs( } +static int s_docx_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content) +/* Appends table to content. 
+ +We do not fix the size of the table or its columns and rows, but instead leave layout up +to the application. */ +{ + int e = -1; + int y; + + if (extract_astring_cat(alloc, content, + "\n" + " <w:tbl>\n" + " <w:tblLayout w:type=\"autofit\"/>\n" + )) goto end; + + for (y=0; y<table->cells_num_y; ++y) + { + int x; + if (extract_astring_cat(alloc, content, + " <w:tr>\n" + " <w:trPr/>\n" + )) goto end; + + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[y*table->cells_num_x + x]; + if (!cell->left) continue; + + if (extract_astring_cat(alloc, content, " <w:tc>\n")) goto end; + + /* Write cell properties. */ + { + if (extract_astring_cat(alloc, content, + " <w:tcPr>\n" + " <w:tcBorders>\n" + " <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" + " <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" + " <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" + " <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" + " </w:tcBorders>\n" + )) goto end; + if (cell->extend_right > 1) + { + if (extract_astring_catf(alloc, content, " <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right)) goto end; + } + if (cell->above) + { + if (cell->extend_down > 1) + { + if (extract_astring_catf(alloc, content, " <w:vMerge w:val=\"restart\"/>\n", cell->extend_down)) goto end; + } + } + else + { + if (extract_astring_catf(alloc, content, " <w:vMerge w:val=\"continue\"/>\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </w:tcPr>\n")) goto end; + } + + /* Write contents of this cell. 
*/ + { + size_t chars_num_old = content->chars_num; + int p; + content_state_t content_state = {0}; + content_state.font.name = NULL; + content_state.ctm_prev = NULL; + for (p=0; p<cell->paragraphs_num; ++p) + { + paragraph_t* paragraph = cell->paragraphs[p]; + if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; + } + if (content_state.font.name) + { + if (s_docx_run_finish(alloc, &content_state, content)) goto end; + } + + /* Need to write out at least an empty paragraph in each cell, + otherwise Word/Libreoffice fail to show table at all; the + OOXML spec says "If a table cell does not include at least one + block-level element, then this document shall be considered + corrupt." */ + if (content->chars_num == chars_num_old) + { + if (extract_astring_catf(alloc, content, "<w:p/>\n")) goto end; + } + } + if (extract_astring_cat(alloc, content, " </w:tc>\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </w:tr>\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </w:tbl>\n")) goto end; + e = 0; + + end: + return e; +} + +static int s_docx_append_rotated_paragraphs( + extract_alloc_t* alloc, + extract_page_t* page, + content_state_t* state, + int* p, + int* text_box_id, + const matrix_t* ctm, + double rotate, + extract_astring_t* content + ) +/* Appends paragraphs with same rotation, starting with page->paragraphs[*p] +and updates *p. */ +{ + /* Find extent of paragraphs with this same rotation. extent + will contain max width and max height of paragraphs, in units + before application of ctm, i.e. before rotation. */ + int e = -1; + point_t extent = {0, 0}; + int p0 = *p; + int p1; + paragraph_t* paragraph = page->paragraphs[*p]; + + outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", + rotate, rotate * 180 / pi, + ctm->e, + ctm->f, + ctm->a, + ctm->b, + ctm->c, + ctm->d + ); + + { + /* We assume that first span is at origin of text + block. This assumes left-to-right text. 
*/ + double rotate0 = rotate; + const matrix_t* ctm0 = ctm; + point_t origin = { + paragraph->lines[0]->spans[0]->chars[0].x, + paragraph->lines[0]->spans[0]->chars[0].y + }; + matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; + double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; + if (ctm_det != 0) { + ctm_inverse.a = +ctm->d / ctm_det; + ctm_inverse.b = -ctm->b / ctm_det; + ctm_inverse.c = -ctm->c / ctm_det; + ctm_inverse.d = +ctm->a / ctm_det; + } + else { + outf("cannot invert ctm=(%f %f %f %f)", + ctm->a, ctm->b, ctm->c, ctm->d); + } + + for (*p=p0; *p<page->paragraphs_num; ++(*p)) { + paragraph = page->paragraphs[*p]; + ctm = &paragraph->lines[0]->spans[0]->ctm; + rotate = atan2(ctm->b, ctm->a); + if (rotate != rotate0) { + break; + } + + /* Update <extent>. */ + { + int l; + for (l=0; l<paragraph->lines_num; ++l) { + line_t* line = paragraph->lines[l]; + span_t* span = extract_line_span_last(line); + char_t* char_ = extract_span_char_last(span); + double adv = char_->adv * extract_matrix_expansion(span->trm); + double x = char_->x + adv * cos(rotate); + double y = char_->y + adv * sin(rotate); + + double dx = x - origin.x; + double dy = y - origin.y; + + /* Position relative to origin and before box rotation. */ + double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; + double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; + yy = -yy; + if (xx > extent.x) extent.x = xx; + if (yy > extent.y) extent.y = yy; + if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", + rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span)); + } + } + } + p1 = *p; + rotate = rotate0; + ctm = ctm0; + outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", + rotate, p0, p1, extent.x, extent.y); + } + + /* Paragraphs p0..p1-1 have same rotation. We output them into + a single rotated text box. */ + + /* We need unique id for text box. */ + *text_box_id += 1; + + { + /* Angles are in units of 1/60,000 degree. 
*/ + int rot = (int) (rotate * 180 / pi * 60000); + + /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm. + relativeHeight is z-ordering. (wp:positionV:wp:posOffset, + wp:positionV:wp:posOffset) is position of origin of box in + EMU. + + The box rotates about its centre but we want to rotate + about the origin (top-left). So we correct the position of + box by subtracting the vector that the top-left moves when + rotated by angle <rotate> about the middle. */ + double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ + int x = (int) (ctm->e * point_to_emu); + int y = (int) (ctm->f * point_to_emu); + int w = (int) (extent.x * point_to_emu); + int h = (int) (extent.y * point_to_emu); + int dx; + int dy; + + if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot); + + h *= 2; + /* We can't predict how much space Word will actually + require for the rotated text, so make the box have the + original width but allow text to take extra vertical + space. There doesn't seem to be a way to make the text box + auto-grow to contain the text. */ + + dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0); + dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0); + outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik", + ctm->e, + ctm->f, + rotate * 180/pi, + x/1000, + y/1000, + dx/1000, + dy/1000 + ); + x -= dx; + y -= -dy; + + if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end; + } + *p = p1 - 1; + e = 0; + + end: + + return e; +} + int extract_document_to_docx_content( extract_alloc_t* alloc, document_t* document, @@ -422,184 +651,73 @@ int extract_document_to_docx_content( /* Write paragraphs into <content>. 
*/ for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - int p; - content_state_t state; - state.font_name = NULL; - state.font_size = 0; - state.font_bold = 0; - state.font_italic = 0; - state.ctm_prev = NULL; - for (p=0; p<page->paragraphs_num; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; - const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; - double rotate = atan2(ctm->b, ctm->a); + int p = 0; + int t = 0; + + content_state_t content_state; + content_state.font.name = NULL; + content_state.font.size = 0; + content_state.font.bold = 0; + content_state.font.italic = 0; + content_state.ctm_prev = NULL; + + /* Output paragraphs and tables in order of y coordinate. */ + for(;;) + { + paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p]; + table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; + double y_paragraph; + double y_table; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? table->pos.y : DBL_MAX; - if (spacing - && state.ctm_prev - && paragraph->lines_num - && paragraph->lines[0]->spans_num - && matrix_cmp4( - state.ctm_prev, - &paragraph->lines[0]->spans[0]->ctm - ) - ) { - /* Extra vertical space between paragraphs that were at - different angles in the original document. */ - if (extract_docx_paragraph_empty(alloc, content)) goto end; - } + if (paragraph && y_paragraph < y_table) + { + const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm; + double rotate = atan2(ctm->b, ctm->a); + + if (spacing + && content_state.ctm_prev + && paragraph->lines_num + && paragraph->lines[0]->spans_num + && extract_matrix_cmp4( + content_state.ctm_prev, + &paragraph->lines[0]->spans[0]->ctm + ) + ) { + /* Extra vertical space between paragraphs that were at + different angles in the original document. 
*/ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - if (spacing) { - /* Extra vertical space between paragraphs. */ - if (extract_docx_paragraph_empty(alloc, content)) goto end; - } - - if (rotation && rotate != 0) { - - /* Find extent of paragraphs with this same rotation. extent - will contain max width and max height of paragraphs, in units - before application of ctm, i.e. before rotation. */ - point_t extent = {0, 0}; - int p0 = p; - int p1; - - outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", - rotate, rotate * 180 / pi, - ctm->e, - ctm->f, - ctm->a, - ctm->b, - ctm->c, - ctm->d - ); - - { - /* We assume that first span is at origin of text - block. This assumes left-to-right text. */ - double rotate0 = rotate; - const matrix_t* ctm0 = ctm; - point_t origin = { - paragraph->lines[0]->spans[0]->chars[0].x, - paragraph->lines[0]->spans[0]->chars[0].y - }; - matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; - double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; - if (ctm_det != 0) { - ctm_inverse.a = +ctm->d / ctm_det; - ctm_inverse.b = -ctm->b / ctm_det; - ctm_inverse.c = -ctm->c / ctm_det; - ctm_inverse.d = +ctm->a / ctm_det; - } - else { - outf("cannot invert ctm=(%f %f %f %f)", - ctm->a, ctm->b, ctm->c, ctm->d); - } + if (spacing) { + /* Extra vertical space between paragraphs. */ + if (s_docx_paragraph_empty(alloc, content)) goto end; + } - for (p=p0; p<page->paragraphs_num; ++p) { - paragraph = page->paragraphs[p]; - ctm = &paragraph->lines[0]->spans[0]->ctm; - rotate = atan2(ctm->b, ctm->a); - if (rotate != rotate0) { - break; - } - - /* Update <extent>. 
*/ - { - int l; - for (l=0; l<paragraph->lines_num; ++l) { - line_t* line = paragraph->lines[l]; - span_t* span = line_span_last(line); - char_t* char_ = span_char_last(span); - double adv = char_->adv * matrix_expansion(span->trm); - double x = char_->x + adv * cos(rotate); - double y = char_->y + adv * sin(rotate); - - double dx = x - origin.x; - double dy = y - origin.y; - - /* Position relative to origin and before box rotation. */ - double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; - double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; - yy = -yy; - if (xx > extent.x) extent.x = xx; - if (yy > extent.y) extent.y = yy; - if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", - rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span)); - } - } - } - p1 = p; - rotate = rotate0; - ctm = ctm0; - outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", - rotate, p0, p1, extent.x, extent.y); + if (rotation && rotate != 0) + { + if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end; } - - /* Paragraphs p0..p1-1 have same rotation. We output them into - a single rotated text box. */ - - /* We need unique id for text box. */ - text_box_id += 1; - + else { - /* Angles are in units of 1/60,000 degree. */ - int rot = (int) (rotate * 180 / pi * 60000); - - /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm. - relativeHeight is z-ordering. (wp:positionV:wp:posOffset, - wp:positionV:wp:posOffset) is position of origin of box in - EMU. - - The box rotates about its centre but we want to rotate - about the origin (top-left). So we correct the position of - box by subtracting the vector that the top-left moves when - rotated by angle <rotate> about the middle. 
*/ - double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ - int x = (int) (ctm->e * point_to_emu); - int y = (int) (ctm->f * point_to_emu); - int w = (int) (extent.x * point_to_emu); - int h = (int) (extent.y * point_to_emu); - int dx; - int dy; - - if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot); - - h *= 2; - /* We can't predict how much space Word will actually - require for the rotated text, so make the box have the - original width but allow text to take extra vertical - space. There doesn't seem to be a way to make the text box - auto-grow to contain the text. */ - - dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0); - dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0); - outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik", - ctm->e, - ctm->f, - rotate * 180/pi, - x/1000, - y/1000, - dx/1000, - dy/1000 - ); - x -= dx; - y -= -dy; - - if (extract_document_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, text_box_id, content, &state)) goto end; + if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end; } - p = p1 - 1; - //p = page->paragraphs_num - 1; + p += 1; } - else { - if (extract_document_to_docx_content_paragraph(alloc, &state, paragraph, content)) goto end; + else if (table) + { + if (s_docx_append_table(alloc, table, content)) goto end; + t += 1; } - } if (images) { int i; for (i=0; i<page->images_num; ++i) { - extract_document_append_image(alloc, content, &page->images[i]); + s_docx_append_image(alloc, content, &page->images[i]); } } } @@ -738,7 +856,6 @@ int extract_docx_write_template( int e = -1; int i; char* path_tempdir = NULL; - FILE* f = NULL; char* path = NULL; char* text = NULL; char* text2 = NULL; @@ -841,7 +958,6 @@ int extract_docx_write_template( extract_free(alloc, &path); extract_free(alloc, &text); extract_free(alloc, &text2); - if (f) fclose(f); if (e) { outf("Failed to 
create %s", path_out); diff --git a/extract/src/docx.h b/extract/src/docx.h index 6e26568f..976272a6 100644 --- a/extract/src/docx.h +++ b/extract/src/docx.h @@ -13,8 +13,8 @@ int extract_document_to_docx_content( int images, extract_astring_t* content ); -/* Makes *o_content point to a string containing all paragraphs in *document in -docx XML format. +/* Makes *o_content point to a string containing all paragraphs, images and +tables (tables as of 2021-07-22) in *document in docx XML format. This string can be passed to extract_docx_content_item() or extract_docx_write_template() to be inserted into a docx archive's diff --git a/extract/src/docx_template_build.py b/extract/src/docx_template_build.py index 5e2f5380..8b836300 100755 --- a/extract/src/docx_template_build.py +++ b/extract/src/docx_template_build.py @@ -9,6 +9,9 @@ Args: --pretty <directory> Prettyfies all .xml files within <directory> using 'xmllint --format'. + -f + Force touch of output file, even if unchanged. + -i <in-path> Set template docx/odt file to extract from. @@ -57,12 +60,17 @@ def write(text, path, encoding): with open(path, 'wb') as f: f.write(text.encode(encoding)) -def write_if_diff(text, path, encoding): - if os.path.isfile(path): - old = read(path, encoding) - if old == text: - return - print(f'Updating path={path} because contents have changed') +def write_if_diff(text, path, encoding, force): + ''' + Does nothing if <force> is false and file named <path> already contains + <text>. Otherwise writes <text> to file named <path>. 
+ ''' + if not force: + if os.path.isfile(path): + old = read(path, encoding) + if old == text: + return + print(f'Updating path={path} because contents have changed') write(text, path, encoding) def check_path_safe(path): @@ -98,6 +106,8 @@ def main(): path_in = None path_out = None infix = None + force = False + args = iter(sys.argv[1:]) while 1: try: arg = next(args) @@ -114,6 +124,8 @@ def main(): path = os.path.join(dirpath, filename) system(f'xmllint --format {path} > {path}-') system(f'mv {path}- {path}') + elif arg == '-f': + force = True elif arg == '-i': path_in = next(args) elif arg == '-n': @@ -166,7 +178,7 @@ def main(): for filename in sorted(filenames): num_items += 1 path = os.path.join(dirpath, filename) - print(f'looking at path={path}') + #print(f'looking at path={path}') name = path[ len(path_temp)+1: ] out_c.write(f' {{\n') out_c.write(f' "{name}",\n') @@ -213,7 +225,7 @@ def main(): out_c.write(f'int {infix}_template_items_num = {num_items};\n') out_c = out_c.getvalue() - write_if_diff(out_c, f'{path_out}.c', 'utf-8') + write_if_diff(out_c, f'{path_out}.c', 'utf-8', force) out_h = io.StringIO() out_h.write(f'#ifndef EXTRACT_{infix.upper()}_TEMPLATE_H\n') @@ -233,7 +245,7 @@ def main(): out_h.write(f'\n') out_h.write(f'\n') out_h.write(f'#endif\n') - write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8') + write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8', force) #os.system(f'rm -r "{path_temp}"') if __name__ == '__main__': diff --git a/extract/src/extract-exe.c b/extract/src/extract-exe.c index 22b520db..ee34023a 100644 --- a/extract/src/extract-exe.c +++ b/extract/src/extract-exe.c @@ -139,6 +139,7 @@ int main(int argc, char** argv) if (arg_next_string(argv, argc, &i, &format_name)) goto end; if (!strcmp(format_name, "odt")) format = extract_format_ODT; else if (!strcmp(format_name, "docx")) format = extract_format_DOCX; + else if (!strcmp(format_name, "html")) format = extract_format_HTML; else { printf("-f value should be 'odt' or 
'docx', not '%s'.\n", format_name); @@ -170,7 +171,7 @@ int main(int argc, char** argv) else if (!strcmp(arg, "-v")) { int verbose; if (arg_next_int(argv, argc, &i, &verbose)) goto end; - outf_verbose_set(verbose); + extract_outf_verbose_set(verbose); outf("Have changed verbose to %i", verbose); } else if (!strcmp(arg, "--v-alloc")) { diff --git a/extract/src/extract.c b/extract/src/extract.c index 9eb85d2f..2c375571 100644 --- a/extract/src/extract.c +++ b/extract/src/extract.c @@ -5,6 +5,7 @@ #include "document.h" #include "docx.h" #include "docx_template.h" +#include "html.h" #include "mem.h" #include "memento.h" #include "odt.h" @@ -25,7 +26,7 @@ -double matrix_expansion(matrix_t m) +double extract_matrix_expansion(matrix_t m) { return sqrt(fabs(m.a * m.d - m.b * m.c)); } @@ -41,14 +42,31 @@ static void char_init(char_t* item) item->adv = 0; } +const char* extract_point_string(const point_t* point) +{ + static char buffer[128]; + snprintf(buffer, sizeof(buffer), "(%f %f)", point->x, point->y); + return buffer; +} + +const char* extract_rect_string(const rect_t* rect) +{ + static char buffer[2][256]; + static int i = 0; + i = (i + 1) % 2; + snprintf(buffer[i], sizeof(buffer[i]), "((%f %f) (%f %f))", rect->min.x, rect->min.y, rect->max.x, rect->max.y); + return buffer[i]; +} -const char* span_string(extract_alloc_t* alloc, span_t* span) +const char* extract_span_string(extract_alloc_t* alloc, span_t* span) { static extract_astring_t ret = {0}; double x0 = 0; double y0 = 0; + point_t pre0 = {0, 0}; double x1 = 0; double y1 = 0; + point_t pre1 = {0, 0}; int c0 = 0; int c1 = 0; int i; @@ -62,17 +80,23 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) c0 = span->chars[0].ucs; x0 = span->chars[0].x; y0 = span->chars[0].y; + pre0.x = span->chars[0].pre_x; + pre0.y = span->chars[0].pre_y; c1 = span->chars[span->chars_num-1].ucs; x1 = span->chars[span->chars_num-1].x; y1 = span->chars[span->chars_num-1].y; + pre1.x = span->chars[span->chars_num-1].pre_x; + 
pre1.y = span->chars[span->chars_num-1].pre_y; } { - char buffer[200]; + char buffer[400]; snprintf(buffer, sizeof(buffer), - "span chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f,%f) wmode=%i chars_num=%i: ", + "span ctm=%s trm=%s chars_num=%i (%c:%f,%f pre(%f %f))..(%c:%f,%f pre(%f %f)) font=%s:(%f,%f) wmode=%i chars_num=%i: ", + extract_matrix_string(&span->ctm), + extract_matrix_string(&span->trm), span->chars_num, - c0, x0, y0, - c1, x1, y1, + c0, x0, y0, pre0.x, pre0.y, + c1, x1, y1, pre1.x, pre1.y, span->font_name, span->trm.a, span->trm.d, @@ -84,9 +108,11 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) snprintf( buffer, sizeof(buffer), - " i=%i {x=%f adv=%f}", + " i=%i {x=%f y=%f ucs=%i adv=%f}", i, span->chars[i].x, + span->chars[i].y, + span->chars[i].ucs, span->chars[i].adv ); extract_astring_cat(alloc, &ret, buffer); @@ -101,7 +127,7 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) return ret.chars; } -int span_append_c(extract_alloc_t* alloc, span_t* span, int c) +int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c) { char_t* item; if (extract_realloc2( @@ -119,7 +145,7 @@ int span_append_c(extract_alloc_t* alloc, span_t* span, int c) return 0; } -char_t* span_char_last(span_t* span) +char_t* extract_span_char_last(span_t* span) { assert(span->chars_num > 0); return &span->chars[span->chars_num-1]; @@ -138,58 +164,62 @@ static const char* line_string(line_t* line) int i; for (i=0; i<line->spans_num; ++i) { extract_astring_cat(&ret, " "); - extract_astring_cat(&ret, span_string(line->spans[i])); + extract_astring_cat(&ret, extract_span_string(line->spans[i])); } return ret.chars; } #endif /* Returns first span in a line. 
*/ -span_t* line_span_last(line_t* line) +span_t* extract_line_span_last(line_t* line) { assert(line->spans_num > 0); return line->spans[line->spans_num - 1]; } -span_t* line_span_first(line_t* line) +span_t* extract_line_span_first(line_t* line) { assert(line->spans_num > 0); return line->spans[0]; } -static void page_free(extract_alloc_t* alloc, extract_page_t* page) + +static void table_free(extract_alloc_t* alloc, table_t** ptable) +{ + int c; + table_t* table = *ptable; + outf("table->cells_num_x=%i table->cells_num_y=%i", + table->cells_num_x, + table->cells_num_y + ); + for (c = 0; c< table->cells_num_x * table->cells_num_y; ++c) + { + extract_cell_free(alloc, &table->cells[c]); + } + extract_free(alloc, &table->cells); + extract_free(alloc, ptable); +} + +static void page_free(extract_alloc_t* alloc, extract_page_t** ppage) { - int s; + extract_page_t* page = *ppage; if (!page) return; - for (s=0; s<page->spans_num; ++s) { - span_t* span = page->spans[s]; - if (span) { - extract_free(alloc, &span->chars); - extract_free(alloc, &span->font_name); - } - extract_free(alloc, &span); - } - extract_free(alloc, &page->spans); + outf0("page=%p page->spans_num=%i page->lines_num=%i", + page, page->spans_num, page->lines_num); + extract_spans_free(alloc, &page->spans, page->spans_num); - { - int l; - for (l=0; l<page->lines_num; ++l) { - line_t* line = page->lines[l]; - extract_free(alloc, &line->spans); - extract_free(alloc, &line); - /* We don't free line->spans->chars[] because already freed via - page->spans. */ - } - } - extract_free(alloc, &page->lines); + extract_lines_free(alloc, &page->lines, page->lines_num); { int p; for (p=0; p<page->paragraphs_num; ++p) { paragraph_t* paragraph = page->paragraphs[p]; + /* We don't call extract_lines_free(&paragraph->lines) because + these point into the same data as page->lines, which we have + already freed above. 
*/ if (paragraph) extract_free(alloc, &paragraph->lines); - extract_free(alloc, &paragraph); + extract_free(alloc, &page->paragraphs[p]); } } extract_free(alloc, &page->paragraphs); @@ -197,13 +227,26 @@ static void page_free(extract_alloc_t* alloc, extract_page_t* page) { int i; for (i=0; i<page->images_num; ++i) { - extract_free(alloc, &page->images[i].data); - extract_free(alloc, &page->images[i].type); - extract_free(alloc, &page->images[i].id); - extract_free(alloc, &page->images[i].name); + extract_image_clear(alloc, &page->images[i]); } + extract_free(alloc, &page->images); } extract_free(alloc, &page->images); + + extract_free(alloc, &page->tablelines_horizontal.tablelines); + extract_free(alloc, &page->tablelines_vertical.tablelines); + + { + int t; + outf("page=%p page->tables_num=%i", page, page->tables_num); + for (t=0; t<page->tables_num; ++t) + { + table_free(alloc, &page->tables[t]); + } + extract_free(alloc, &page->tables); + } + + extract_free(alloc, ppage); } static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page) @@ -212,9 +255,7 @@ error. */ { span_t* span; if (extract_malloc(alloc, &span, sizeof(*span))) return NULL; - span->font_name = NULL; - span->chars = NULL; - span->chars_num = 0; + extract_span_init(span); if (extract_realloc2( alloc, &page->spans, @@ -234,14 +275,7 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images) { int i; for (i=0; i<images->images_num; ++i) { - image_t* image = &images->images[i]; - extract_free(alloc, &image->type); - extract_free(alloc, &image->name); - extract_free(alloc, &image->id); - if (image->data_free) { - image->data_free(image->data_free_handle, image->data); - } - extract_free(alloc, &images->images[i]); + extract_image_clear(alloc, &images->images[i]); } extract_free(alloc, &images->images); extract_free(alloc, &images->imagetypes); @@ -260,10 +294,12 @@ On return document->page[].images* will be NULL etc. 
int p; images_t images = {0}; outf("extract_document_images(): images.images_num=%i", images.images_num); - for (p=0; p<document->pages_num; ++p) { + for (p=0; p<document->pages_num; ++p) + { extract_page_t* page = document->pages[p]; int i; - for (i=0; i<page->images_num; ++i) { + for (i=0; i<page->images_num; ++i) + { image_t* image; if (extract_realloc2( alloc, @@ -280,14 +316,17 @@ On return document->page[].images* will be NULL etc. /* Add image type if we haven't seen it before. */ { int it; - for (it=0; it<images.imagetypes_num; ++it) { + for (it=0; it<images.imagetypes_num; ++it) + { outf("it=%i images.imagetypes[it]=%s image->type=%s", it, images.imagetypes[it], image->type); if (!strcmp(images.imagetypes[it], image->type)) { break; } } - if (it == images.imagetypes_num) { + if (it == images.imagetypes_num) + { + /* We haven't seen this image type before. */ if (extract_realloc2( alloc, &images.imagetypes, @@ -314,9 +353,12 @@ On return document->page[].images* will be NULL etc. 
} e = 0; end: - if (e) { + if (e) + { + extract_free(alloc, &images.images); } - else { + else + { *o_images = images; } return e; @@ -330,8 +372,7 @@ static void extract_document_free(extract_alloc_t* alloc, document_t* document) } for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - page_free(alloc, page); - extract_free(alloc, &page); + page_free(alloc, &page); } extract_free(alloc, &document->pages); document->pages = NULL; @@ -347,7 +388,7 @@ static int s_sign(double x) return 0; } -int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) +int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) { int ret; ret = s_sign(lhs->a - rhs->a); if (ret) return ret; @@ -358,7 +399,7 @@ int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) } -static point_t multiply_matrix_point(matrix_t m, point_t p) +point_t extract_multiply_matrix_point(matrix_t m, point_t p) { double x = p.x; p.x = m.a * x + m.c * p.y; @@ -366,6 +407,18 @@ static point_t multiply_matrix_point(matrix_t m, point_t p) return p; } +matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2) +{ + matrix_t ret; + ret.a = m1.a * m2.a + m1.b * m2.c; + ret.b = m1.a * m2.b + m1.b * m2.d; + ret.c = m1.c * m2.a + m1.d * m2.c; + ret.d = m1.c * m2.b + m1.d * m2.d; + ret.e = m1.e + m2.e; + ret.f = m1.f + m2.f; + return ret; +} + static int s_matrix_read(const char* text, matrix_t* matrix) { int n; @@ -427,8 +480,8 @@ char_t into a new span_t. */ return 0; } - font_size = matrix_expansion(span->trm) - * matrix_expansion(span->ctm); + font_size = extract_matrix_expansion(span->trm) + * extract_matrix_expansion(span->ctm); if (span->flags.wmode) { dir.x = 0; @@ -438,7 +491,7 @@ char_t into a new span_t. */ dir.x = 1; dir.y = 0; } - dir = multiply_matrix_point(span->trm, dir); + dir = extract_multiply_matrix_point(span->trm, dir); x = char_[-2].pre_x + char_[-2].adv * dir.x; y = char_[-2].pre_y + char_[-2].adv * dir.y; @@ -470,10 +523,10 @@ char_t into a new span_t. 
*/ sometimes seem to appear in the middle of words for some reason. */ outfx("removing space before final char in: %s", - span_string(span)); + extract_span_string(span)); span->chars[span->chars_num-2] = span->chars[span->chars_num-1]; span->chars_num -= 1; - outfx("span is now: %s", span_string(span)); + outfx("span is now: %s", extract_span_string(span)); return 0; } } @@ -536,9 +589,42 @@ struct extract_t int contentss_num; images_t images; - + extract_format_t format; extract_odt_styles_t odt_styles; + + char* tables_csv_format; + int tables_csv_i; + + enum + { + path_type_NONE, + path_type_FILL, + path_type_STROKE, + } path_type; + + union + { + struct + { + matrix_t ctm; + double color; + point_t points[4]; + int n; + } fill; + + struct + { + matrix_t ctm; + double color; + double width; + point_t point0; + int point0_set; + point_t point; + int point_set; + } stroke; + + } path; }; @@ -551,7 +637,12 @@ int extract_begin( int e = -1; extract_t* extract; - if (format != extract_format_ODT && format != extract_format_DOCX) + if (1 + && format != extract_format_ODT + && format != extract_format_DOCX + && format != extract_format_HTML + && format != extract_format_TEXT + ) { outf0("Invalid format=%i\n", format); errno = EINVAL; @@ -570,6 +661,8 @@ int extract_begin( extract->image_n = 10; extract->format = format; + extract->tables_csv_format = NULL; + extract->tables_csv_i = 0; e = 0; @@ -578,6 +671,11 @@ int extract_begin( return e; } +int extract_tables_csv_format(extract_t* extract, const char* path_format) +{ + return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); +} + static void image_free_fn(void* handle, void* image_data) { @@ -872,6 +970,22 @@ int extract_span_begin( span_t* span; assert(extract->document.pages_num > 0); page = extract->document.pages[extract->document.pages_num-1]; + outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i", + ctm_a, + ctm_b, + ctm_c, + ctm_d, + 
ctm_e, + ctm_f, + trm_a, + trm_b, + trm_c, + trm_d, + trm_e, + trm_f, + font_name, + wmode + ); span = page_span_append(extract->alloc, page); if (!span) goto end; span->ctm.a = ctm_a; @@ -880,12 +994,14 @@ int extract_span_begin( span->ctm.d = ctm_d; span->ctm.e = ctm_e; span->ctm.f = ctm_f; + span->trm.a = trm_a; span->trm.b = trm_b; span->trm.c = trm_c; span->trm.d = trm_d; span->trm.e = trm_e; span->trm.f = trm_f; + { const char* ff = strchr(font_name, '+'); const char* f = (ff) ? ff+1 : font_name; @@ -916,7 +1032,49 @@ int extract_add_char( extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; span_t* span = page->spans[page->spans_num - 1]; - if (autosplit && y - extract->span_offset_y != 0) { + outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); + /* Ignore the specified <autosplit> - there seems no advantage to not + splitting spans on multiple lines, and not doing so causes problems with + missing spaces in the output. */ + autosplit = 1; + + if (span->chars_num) + { + char_t* char_prev = &span->chars[span->chars_num - 1]; + double xx = span->ctm.a * x + span->ctm.c * y + span->ctm.e; + double yy = span->ctm.b * x + span->ctm.d * y + span->ctm.f; + double dx = xx - char_prev->x; + double dy = yy - char_prev->y; + double a = atan2(dy, dx); + double span_a; + matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm); + point_t dir = {1 - span->flags.wmode, span->flags.wmode}; + dir = extract_multiply_matrix_point(m, dir); + span_a = atan2(dir.y, dir.x); + if (fabs(span_a - a) > 0.01) + { + /* Create new span. 
*/ + span_t* span0 = span; + outf("chars_num=%i prev=(%f %f) => (%f %f) xy=(%f %f) => xxyy=(%f %f) delta=(%f %f) a=%f not in line with dir=(%f %f) a=%f: ", + span->chars_num, + char_prev->pre_x, char_prev->pre_y, + char_prev->x, char_prev->y, + x, y, + xx, yy, + dx, dy, a, + dir.x, dir.y, span_a + ); + extract->num_spans_autosplit += 1; + span = page_span_append(extract->alloc, page); + if (!span) goto end; + *span = *span0; + span->chars = NULL; + span->chars_num = 0; + if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end; + } + } + + if (0 && autosplit && y - extract->span_offset_y != 0) { double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x) + span->ctm.b * (y - extract->span_offset_y); @@ -949,21 +1107,20 @@ int extract_add_char( char_pre_y, offset_y); } - if (span_append_c(extract->alloc, span, 0 /*c*/)) goto end; + if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end; + /* Coverity warns, but extract_span_append_c() will have appended an item. 
*/ + /* coverity[var_deref_op] */ char_ = &span->chars[ span->chars_num-1]; - char_->pre_x = x - extract->span_offset_x; - char_->pre_y = y - extract->span_offset_y; + char_->pre_x = x; + char_->pre_y = y; - char_->x = span->ctm.a * char_->pre_x + span->ctm.b * char_->pre_y; - char_->y = span->ctm.c * char_->pre_x + span->ctm.d * char_->pre_y; + char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e; + char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f; char_->adv = adv; char_->ucs = ucs; - char_->x += span->ctm.e; - char_->y += span->ctm.f; - { int page_spans_num_old = page->spans_num; if (page_span_end_clean(extract->alloc, page)) goto end; @@ -1049,6 +1206,174 @@ int extract_add_image( return e; } + +static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, rect_t* rect, double color) +{ + if (extract_realloc( + alloc, + &tablelines->tablelines, + sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1) + )) return -1; + tablelines->tablelines[ tablelines->tablelines_num].rect = *rect; + tablelines->tablelines[ tablelines->tablelines_num].color = (float) color; + tablelines->tablelines_num += 1; + return 0; +} + +static point_t transform(double x, double y, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f + ) +{ + point_t ret; + ret.x = ctm_a * x + ctm_b * y + ctm_e; + ret.y = ctm_c * x + ctm_d * y + ctm_f; + return ret; +} + +static double s_min(double a, double b) +{ + return (a < b) ? a : b; +} + +static double s_max(double a, double b) +{ + return (a > b) ? 
a : b; +} + +int extract_add_path4( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double x0, + double y0, + double x1, + double y1, + double x2, + double y2, + double x3, + double y3, + double color + ) +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + point_t points[4] = { + transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f) + }; + rect_t rect; + int i; + double dx; + double dy; + if (0 && color == 1) + { + return 0; + } + outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]", + ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f, + x0, y0, x1, y1, x2, y2, x3, y3 + ); + outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]", + x0, y0, x1, y1, x2, y2, x3, y3); + /* Find first step with dx > 0. */ + for (i=0; i<4; ++i) + { + if (points[(i+1) % 4].x > points[(i+0) % 4].x) break; + } + outf("i=%i", i); + if (i == 4) return 0; + rect.min.x = points[(i+0) % 4].x; + rect.max.x = points[(i+1) % 4].x; + if (points[(i+2) % 4].x != rect.max.x) return 0; + if (points[(i+3) % 4].x != rect.min.x) return 0; + y0 = points[(i+1) % 4].y; + y1 = points[(i+2) % 4].y; + if (y0 == y1) return 0; + if (points[(i+3) % 4].y != y1) return 0; + if (points[(i+4) % 4].y != y0) return 0; + rect.min.y = (y1 > y0) ? y0 : y1; + rect.max.y = (y1 > y0) ? y1 : y0; + + dx = rect.max.x - rect.min.x; + dy = rect.max.y - rect.min.y; + if (dx / dy > 5) + { + /* Horizontal line. */ + outf("have found horizontal line: %s", extract_rect_string(&rect)); + if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1; + } + else if (dy / dx > 5) + { + /* Vertical line. 
*/ + outf("have found vertical line: %s", extract_rect_string(&rect)); + if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1; + } + return 0; +} + + +int extract_add_line( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double width, + double x0, + double y0, + double x1, + double y1, + double color + ) +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); + point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); + double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); + rect_t rect; + (void) color; + rect.min.x = s_min(p0.x, p1.x); + rect.min.y = s_min(p0.y, p1.y); + rect.max.x = s_max(p0.x, p1.x); + rect.max.y = s_max(p0.y, p1.y); + + outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", + extract_FUNCTION, + width, + x0, y0, x1, y1, + extract_rect_string(&rect) + ); + if (rect.min.x == rect.max.x && rect.min.y == rect.max.y) + { + } + else if (rect.min.x == rect.max.x) + { + rect.min.x -= width2 / 2; + rect.max.x += width2 / 2; + return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color); + } + else if (rect.min.y == rect.max.y) + { + rect.min.y -= width2 / 2; + rect.max.y += width2 / 2; + return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color); + } + return 0; +} + + int extract_page_begin(extract_t* extract) { /* Appends new empty extract_page_t to an extract->document. 
*/ @@ -1062,6 +1387,13 @@ int extract_page_begin(extract_t* extract) page->paragraphs_num = 0; page->images = NULL; page->images_num = 0; + page->tablelines_horizontal.tablelines = NULL; + page->tablelines_horizontal.tablelines_num = 0; + page->tablelines_vertical.tablelines = NULL; + page->tablelines_vertical.tablelines_num = 0; + page->tables = NULL; + page->tables_num = 0; + if (extract_realloc2( extract->alloc, &extract->document.pages, @@ -1076,6 +1408,231 @@ int extract_page_begin(extract_t* extract) return 0; } +int extract_fill_begin( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double color + ) +{ + assert(extract->path_type == path_type_NONE); + extract->path_type = path_type_FILL; + extract->path.fill.color = color; + extract->path.fill.n = 0; + extract->path.fill.ctm.a = ctm_a; + extract->path.fill.ctm.b = ctm_b; + extract->path.fill.ctm.c = ctm_c; + extract->path.fill.ctm.d = ctm_d; + extract->path.fill.ctm.e = ctm_e; + extract->path.fill.ctm.f = ctm_f; + return 0; +} + +int extract_stroke_begin( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double line_width, + double color + ) +{ + assert(extract->path_type == path_type_NONE); + extract->path_type = path_type_STROKE; + extract->path.stroke.ctm.a = ctm_a; + extract->path.stroke.ctm.b = ctm_b; + extract->path.stroke.ctm.c = ctm_c; + extract->path.stroke.ctm.d = ctm_d; + extract->path.stroke.ctm.e = ctm_e; + extract->path.stroke.ctm.f = ctm_f; + extract->path.stroke.width = line_width; + extract->path.stroke.color = color; + extract->path.stroke.point0_set = 0; + extract->path.stroke.point_set = 0; + return 0; +} + +int extract_moveto(extract_t* extract, double x, double y) +{ + if (extract->path_type == path_type_FILL) + { + if (extract->path.fill.n == -1) return 0; + if (extract->path.fill.n != 0) + { + outf0("returning error. 
extract->path.fill.n=%i", extract->path.fill.n); + extract->path.fill.n = -1; + return 0; + } + extract->path.fill.points[extract->path.fill.n].x = x; + extract->path.fill.points[extract->path.fill.n].y = y; + extract->path.fill.n += 1; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + extract->path.stroke.point.x = x; + extract->path.stroke.point.y = y; + extract->path.stroke.point_set = 1; + if (!extract->path.stroke.point0_set) + { + extract->path.stroke.point0 = extract->path.stroke.point; + extract->path.stroke.point0_set = 1; + } + return 0; + } + else + { + assert(0); + return -1; + } +} + +int extract_lineto(extract_t* extract, double x, double y) +{ + if (extract->path_type == path_type_FILL) + { + if (extract->path.fill.n == -1) return 0; + if (extract->path.fill.n == 0 || extract->path.fill.n >= 4) + { + outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); + extract->path.fill.n = -1; + return 0; + } + extract->path.fill.points[extract->path.fill.n].x = x; + extract->path.fill.points[extract->path.fill.n].y = y; + extract->path.fill.n += 1; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + if (extract->path.stroke.point_set) + { + if (extract_add_line( + extract, + extract->path.stroke.ctm.a, + extract->path.stroke.ctm.b, + extract->path.stroke.ctm.c, + extract->path.stroke.ctm.d, + extract->path.stroke.ctm.e, + extract->path.stroke.ctm.f, + extract->path.stroke.width, + extract->path.stroke.point.x, + extract->path.stroke.point.y, + x, + y, + extract->path.stroke.color + )) + { + return -1; + } + } + extract->path.stroke.point.x = x; + extract->path.stroke.point.y = y; + extract->path.stroke.point_set = 1; + if (!extract->path.stroke.point0_set) + { + extract->path.stroke.point0 = extract->path.stroke.point; + extract->path.stroke.point0_set = 1; + } + return 0; + } + else + { + assert(0); + return -1; + } +} + +int extract_closepath(extract_t* extract) +{ + if (extract->path_type == 
path_type_FILL) + { + if (extract->path.fill.n == 4) + { + /* We are closing a four-element path, so this could be a thin + rectangle that defines a line in a table. */ + int e; + e = extract_add_path4( + extract, + extract->path.fill.ctm.a, + extract->path.fill.ctm.b, + extract->path.fill.ctm.c, + extract->path.fill.ctm.d, + extract->path.fill.ctm.e, + extract->path.fill.ctm.f, + extract->path.fill.points[0].x, + extract->path.fill.points[0].y, + extract->path.fill.points[1].x, + extract->path.fill.points[1].y, + extract->path.fill.points[2].x, + extract->path.fill.points[2].y, + extract->path.fill.points[3].x, + extract->path.fill.points[3].y, + extract->path.fill.color + ); + if (e) return e; + } + extract->path.fill.n = 0; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + if (extract->path.stroke.point0_set && extract->path.stroke.point_set) + { + if (extract_add_line( + extract, + extract->path.stroke.ctm.a, + extract->path.stroke.ctm.b, + extract->path.stroke.ctm.c, + extract->path.stroke.ctm.d, + extract->path.stroke.ctm.e, + extract->path.stroke.ctm.f, + extract->path.stroke.width, + extract->path.stroke.point.x, + extract->path.stroke.point.y, + extract->path.stroke.point0.x, + extract->path.stroke.point0.y, + extract->path.stroke.color + )) + { + return -1; + } + return 0; + } + extract->path.stroke.point = extract->path.stroke.point0; + return 0; + } + else + { + assert(0); + return -1; + } +} + + +int extract_fill_end(extract_t* extract) +{ + assert(extract->path_type == path_type_FILL); + extract->path_type = path_type_NONE; + return 0; +} + + +int extract_stroke_end(extract_t* extract) +{ + assert(extract->path_type == path_type_STROKE); + extract->path_type = path_type_NONE; + return 0; +} + + int extract_page_end(extract_t* extract) { @@ -1083,6 +1640,118 @@ int extract_page_end(extract_t* extract) return 0; } + +static int paragraphs_to_text_content( + extract_alloc_t* alloc, + paragraph_t** paragraphs, + int paragraphs_num, 
+ extract_astring_t* text + ) +{ + int p; + for (p=0; p<paragraphs_num; ++p) + { + paragraph_t* paragraph = paragraphs[p]; + int l; + for (l=0; l<paragraph->lines_num; ++l) + { + line_t* line = paragraph->lines[l]; + int s; + for (s=0; s<line->spans_num; ++s) + { + span_t* span = line->spans[s]; + int c; + for (c=0; c<span->chars_num; ++c) + { + /* We encode each character as utf8. */ + char_t* char_ = &span->chars[c]; + unsigned cc = char_->ucs; + if (extract_astring_catc_unicode( + alloc, + text, + cc, + 0 /*xml*/, + 1 /*ascii_ligatures*/, + 1 /*ascii_dash*/, + 1 /*ascii_apostrophe*/ + )) return -1; + } + } + } + if (extract_astring_catc(alloc, text, '\n')) return -1; + } + return 0; +} + + +static int extract_write_tables_csv(extract_t* extract) +{ + int ret = -1; + int p; + char* path = NULL; + FILE* f = NULL; + extract_astring_t text = {NULL, 0}; + if (!extract->tables_csv_format) return 0; + + outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); + outf("extract->document.pages_num=%i", extract->document.pages_num); + for (p=0; p<extract->document.pages_num; ++p) + { + extract_page_t* page = extract->document.pages[p]; + int t; + outf("p=%i page->tables_num=%i", p, page->tables_num); + for (t=0; t<page->tables_num; ++t) + { + table_t* table = page->tables[t]; + int y; + extract_free(extract->alloc, &path); + if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; + extract->tables_csv_i += 1; + outf("Writing table %i to: %s", t, path); + outf("table->cells_num_x=%i", table->cells_num_x); + outf("table->cells_num_y=%i", table->cells_num_y); + f = fopen(path, "w"); + if (!f) goto end; + for (y=0; y<table->cells_num_y; ++y) + { + int x; + int have_output = 0; + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[table->cells_num_x * y + x]; + extract_astring_free(extract->alloc, &text); + if (y==0) + { + outf("y=0 x=%i cell->rect=%s", x, 
extract_rect_string(&cell->rect)); + } + if (have_output) fprintf(f, ","); + have_output = 1; + if (paragraphs_to_text_content( + extract->alloc, + cell->paragraphs, + cell->paragraphs_num, + &text + )) goto end; + /* Reference cvs output trims trailing spaces. */ + extract_astring_char_truncate_if(&text, ' '); + fprintf(f, "\"%s\"", text.chars ? text.chars : ""); + } + fprintf(f, "\n"); + } + fclose(f); + f = NULL; + } + } + ret = 0; + + end: + if (f) fclose(f); + extract_free(extract->alloc, &path); + extract_astring_free(extract->alloc, &text); + return ret; +} + + int extract_process( extract_t* extract, int spacing, @@ -1126,6 +1795,30 @@ int extract_process( &extract->contentss[extract->contentss_num - 1] )) goto end; } + else if (extract->format == extract_format_HTML) + { + if (extract_document_to_html_content( + extract->alloc, + &extract->document, + rotation, + images, + &extract->contentss[extract->contentss_num - 1] + )) goto end; + } + else if (extract->format == extract_format_TEXT) + { + int p; + for (p=0; p<extract->document.pages_num; ++p) + { + extract_page_t* page = extract->document.pages[p]; + if (paragraphs_to_text_content( + extract->alloc, + page->paragraphs, + page->paragraphs_num, + &extract->contentss[extract->contentss_num - 1] + )) goto end; + } + } else { outf0("Invalid format=%i", extract->format); @@ -1136,11 +1829,15 @@ int extract_process( if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; + if (extract->tables_csv_format) + { + extract_write_tables_csv(extract); + } + { int i; for (i=0; i<extract->document.pages_num; ++i) { - page_free(extract->alloc, extract->document.pages[i]); - extract_free(extract->alloc, &extract->document.pages[i]); + page_free(extract->alloc, &extract->document.pages[i]); } extract_free(extract->alloc, &extract->document.pages); extract->document.pages_num = 0; @@ -1159,9 +1856,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) char* text2 = 
NULL; int i; - if (extract_zip_open(buffer, &zip)) goto end; if (extract->format == extract_format_ODT) { + if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<odt_template_items_num; ++i) { const odt_template_item_t* item = &odt_template_items[i]; extract_free(extract->alloc, &text2); @@ -1191,9 +1888,11 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } + if (extract_zip_close(&zip)) goto end; } else if (extract->format == extract_format_DOCX) { + if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<docx_template_items_num; ++i) { const docx_template_item_t* item = &docx_template_items[i]; extract_free(extract->alloc, &text2); @@ -1222,6 +1921,22 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } + if (extract_zip_close(&zip)) goto end; + + } + else if (extract->format == extract_format_HTML) + { + for (i=0; i<extract->contentss_num; ++i) + { + if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; + } + } + else if (extract->format == extract_format_TEXT) + { + for (i=0; i<extract->contentss_num; ++i) + { + if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; + } } else { @@ -1231,15 +1946,15 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) return 1; } - if (extract_zip_close(&zip)) goto end; - assert(!zip); - e = 0; end: - if (e) outf("failed: %s", strerror(errno)); + if (e) + { + outf("failed: %s", strerror(errno)); + extract_zip_close(&zip); + } extract_free(extract->alloc, &text2); - extract_zip_close(&zip); return e; } @@ -1300,6 +2015,7 @@ int 
extract_write_template( } } + void extract_end(extract_t** pextract) { extract_t* extract = *pextract; @@ -1314,12 +2030,13 @@ void extract_end(extract_t** pextract) extract_free(extract->alloc, &extract->contentss); } extract_images_free(extract->alloc, &extract->images); + extract_odt_styles_free(extract->alloc, &extract->odt_styles); extract_free(extract->alloc, pextract); } void extract_internal_end(void) { - span_string(NULL, NULL); + extract_span_string(NULL, NULL); } void extract_exp_min(extract_t* extract, size_t size) @@ -1329,8 +2046,8 @@ void extract_exp_min(extract_t* extract, size_t size) double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm) { - double font_size = matrix_expansion(*trm) - * matrix_expansion(*ctm); + double font_size = extract_matrix_expansion(*trm) + * extract_matrix_expansion(*ctm); /* Round font_size to nearest 0.01. */ font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; return font_size; diff --git a/extract/src/html.c b/extract/src/html.c new file mode 100644 index 00000000..d12a3101 --- /dev/null +++ b/extract/src/html.c @@ -0,0 +1,314 @@ +/* These extract_html_*() functions generate docx content and docx zip archive +data. + +Caller must call things in a sensible order to create valid content - +e.g. don't call docx_paragraph_start() twice without intervening call to +docx_paragraph_finish(). 
*/ + +#include "../include/extract.h" + +#include "astring.h" +#include "document.h" +#include "html.h" +#include "mem.h" +#include "memento.h" +#include "outf.h" +#include "sys.h" +#include "text.h" +#include "zip.h" + +#include <assert.h> +#include <errno.h> +#include <float.h> +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <sys/stat.h> + + +static void content_state_init(content_state_t* content_state) +{ + content_state->font.name = NULL; + content_state->font.size = 0; + content_state->font.bold = 0; + content_state->font.italic = 0; + content_state->ctm_prev = NULL; +} + +static int content_state_reset(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content) +{ + int e = -1; + if (content_state->font.bold) + { + if (extract_astring_cat(alloc, content, "</b>")) goto end; + content_state->font.bold = 0; + } + if (content_state->font.italic) + { + if (extract_astring_cat(alloc, content, "</i>")) goto end; + content_state->font.italic = 0; + } + e = 0; + + end: + return e; +} + +static int paragraph_to_html_content( + extract_alloc_t* alloc, + content_state_t* content_state, + paragraph_t* paragraph, + int single_line, + extract_astring_t* content + ) +{ + int e = -1; + const char* endl = (single_line) ? "" : "\n"; + int l; + if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end; + + for (l=0; l<paragraph->lines_num; ++l) + { + line_t* line = paragraph->lines[l]; + int s; + for (s=0; s<line->spans_num; ++s) + { + int c; + span_t* span = line->spans[s]; + content_state->ctm_prev = &span->ctm; + if (span->flags.font_bold != content_state->font.bold) + { + if (extract_astring_cat(alloc, content, + span->flags.font_bold ? "<b>" : "</b>" + )) goto end; + content_state->font.bold = span->flags.font_bold; + } + if (span->flags.font_italic != content_state->font.italic) + { + if ( extract_astring_cat(alloc, content, + span->flags.font_italic ? 
"<i>" : "</i>" + )) goto end; + content_state->font.italic = span->flags.font_italic; + } + + for (c=0; c<span->chars_num; ++c) + { + char_t* char_ = &span->chars[c]; + if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end; + } + } + + if (content->chars_num && l+1 < paragraph->lines_num) + { + if (content->chars[content->chars_num-1] == '-') content->chars_num -= 1; + else if (content->chars[content->chars_num-1] != ' ') + { + extract_astring_catc(alloc, content, ' '); + } + } + } + if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end; + + e = 0; + + end: + return e; +} + + +static int paragraphs_to_html_content( + extract_alloc_t* alloc, + content_state_t* state, + paragraph_t** paragraphs, + int paragraphs_num, + int single_line, + extract_astring_t* content + ) +/* Append html for paragraphs[] to <content>. Updates *state if we change font +etc. */ +{ + int e = -1; + int p; + for (p=0; p<paragraphs_num; ++p) + { + paragraph_t* paragraph = paragraphs[p]; + if (paragraph_to_html_content(alloc, state, paragraph, single_line, content)) goto end; + } + + if (content_state_reset(alloc, state, content)) goto end; + e = 0; + + end: + return e; +} + +static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* table, extract_astring_t* content) +{ + int e = -1; + int y; + + if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) goto end; + + for (y=0; y<table->cells_num_y; ++y) + { + /* If 1, we put each <td>...</td> on a separate line. 
*/ + int multiline = 0; + int x; + if (extract_astring_cat(alloc, content, " <tr>\n")) goto end; + if (!multiline) + { + if (extract_astring_cat(alloc, content, " ")) goto end; + } + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[y*table->cells_num_x + x]; + if (!cell->above || !cell->left) + { + /* HTML does not require anything for cells that are subsumed + by other cells that extend horizontally and vertically. */ + continue; + } + if (extract_astring_cat(alloc, content, " ")) goto end; + if (extract_astring_cat(alloc, content, "<td")) goto end; + + if (cell->extend_right > 1) + { + if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end; + } + if (cell->extend_down > 1) + { + if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end; + } + + if (extract_astring_cat(alloc, content, ">")) goto end; + + if (paragraphs_to_html_content(alloc, state, cell->paragraphs, cell->paragraphs_num, 1 /* single_line*/, content)) goto end; + if (extract_astring_cat(alloc, content, "</td>")) goto end; + if (extract_astring_cat(alloc, content, "\n")) goto end; + + if (content_state_reset(alloc, state, content)) goto end; + } + if (!multiline) + { + if (extract_astring_cat(alloc, content, "\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </tr>\n")) goto end; + } + if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end; + e = 0; + + end: + return e; +} + + +static char_t* paragraph_first_char(const paragraph_t* paragraph) +{ + line_t* line = paragraph->lines[paragraph->lines_num - 1]; + span_t* span = line->spans[line->spans_num - 1]; + return &span->chars[0]; +} + +static int compare_paragraph_y(const void* a, const void* b) +{ + const paragraph_t* const* a_paragraph = a; + const paragraph_t* const* b_paragraph = b; + double a_y = paragraph_first_char(*a_paragraph)->y; + double b_y = paragraph_first_char(*b_paragraph)->y; + if (a_y > b_y) return +1; + if (a_y < b_y) 
return -1; + return 0; +} + +int extract_document_to_html_content( + extract_alloc_t* alloc, + document_t* document, + int rotation, + int images, + extract_astring_t* content + ) +{ + int ret = -1; + int p; + paragraph_t** paragraphs = NULL; + + (void) rotation; + (void) images; + + extract_astring_cat(alloc, content, "<html>\n"); + extract_astring_cat(alloc, content, "<body>\n"); + + /* Write paragraphs into <content>. */ + for (p=0; p<document->pages_num; ++p) + { + extract_page_t* page = document->pages[p]; + int p; + int t; + content_state_t state; + content_state_init(&state); + extract_free(alloc, &paragraphs); + + /* Output paragraphs and tables in order of increasing <y> coordinate. + + Unfortunately the paragraph ordering we do in page->paragraphs[] + isn't quite right and results in bad ordering if ctm/trm matrices are + inconsistent. So we create our own list of paragraphs sorted strictly + by y coordinate of the first char of each paragraph. */ + if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * page->paragraphs_num)) goto end; + for (p = 0; p < page->paragraphs_num; ++p) + { + paragraphs[p] = page->paragraphs[p]; + } + qsort(paragraphs, page->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y); + + if (0) + { + int p; + outf0("paragraphs are:"); + for (p=0; p<page->paragraphs_num; ++p) + { + paragraph_t* paragraph = page->paragraphs[p]; + line_t* line = paragraph->lines[0]; + span_t* span = line->spans[0]; + outf0(" p=%i: %s", p, extract_span_string(NULL, span)); + } + } + + p = 0; + t = 0; + for(;;) + { + double y_paragraph; + double y_table; + paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : paragraphs[p]; + table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? 
table->pos.y : DBL_MAX; + outf("p=%i y_paragraph=%f", p, y_paragraph); + outf("t=%i y_table=%f", t, y_table); + if (paragraph && y_paragraph < y_table) + { + //extract_astring_catf(alloc, content, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph); + if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, content)) goto end; + if (content_state_reset(alloc, &state, content)) goto end; + p += 1; + } + else if (table) + { + //extract_astring_catf(alloc, content, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table); + if (append_table(alloc, &state, table, content)) goto end; + t += 1; + } + } + } + extract_astring_cat(alloc, content, "</body>\n"); + extract_astring_cat(alloc, content, "</html>\n"); + ret = 0; + + end: + extract_free(alloc, &paragraphs); + return ret; +} diff --git a/extract/src/html.h b/extract/src/html.h new file mode 100644 index 00000000..6148a067 --- /dev/null +++ b/extract/src/html.h @@ -0,0 +1,23 @@ +#ifndef ARTIFEX_EXTRACT_HTML_H +#define ARTIFEX_EXTRACT_HTML_H + +/* Only for internal use by extract code. */ + +/* Things for creating docx files. */ + +int extract_document_to_html_content( + extract_alloc_t* alloc, + document_t* document, + int rotation, + int images, + extract_astring_t* content + ); +/* Makes *o_content point to a string containing all paragraphs in *document in +docx XML format. + +This string can be passed to extract_docx_content_item() or +extract_docx_write_template() to be inserted into a docx archive's +word/document.xml. */ + + +#endif diff --git a/extract/src/join.c b/extract/src/join.c index f12e2751..4425de3d 100644 --- a/extract/src/join.c +++ b/extract/src/join.c @@ -7,6 +7,7 @@ #include "outf.h" #include <assert.h> +#include <float.h> #include <math.h> #include <stdio.h> @@ -17,24 +18,39 @@ static char_t* span_char_first(span_t* span) return &span->chars[0]; } +static span_t* s_line_span_first(line_t* line) +{ + return extract_line_span_first(line); +} + /* Returns first char_t in a line. 
*/ static char_t* line_item_first(line_t* line) { - span_t* span = line_span_first(line); + span_t* span = s_line_span_first(line); return span_char_first(span); } /* Returns last char_t in a line. */ static char_t* line_item_last(line_t* line) { - span_t* span = line_span_last(line); - return span_char_last(span); + span_t* span = extract_line_span_last(line); + return extract_span_char_last(span); } -static const char* matrix_string(const matrix_t* matrix) +static point_t char_to_point(const char_t* char_) { - static char ret[64]; - snprintf(ret, sizeof(ret), "{%f %f %f %f %f %f}", + point_t ret; + ret.x = char_->x; + ret.y = char_->y; + return ret; +} + +const char* extract_matrix_string(const matrix_t* matrix) +{ + static char ret[5][64]; + static int i = 0; + i = (i + 1) % 5; + snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f %f %f}", matrix->a, matrix->b, matrix->c, @@ -42,17 +58,17 @@ static const char* matrix_string(const matrix_t* matrix) matrix->e, matrix->f ); - return ret; + return ret[i]; } /* Returns total width of span. */ static double span_adv_total(span_t* span) { - double dx = span_char_last(span)->x - span_char_first(span)->x; - double dy = span_char_last(span)->y - span_char_first(span)->y; + double dx = extract_span_char_last(span)->x - span_char_first(span)->x; + double dy = extract_span_char_last(span)->y - span_char_first(span)->y; /* We add on the advance of the last item; this avoids us returning zero if there's only one item. 
*/ - double adv = span_char_last(span)->adv * matrix_expansion(span->trm); + double adv = extract_span_char_last(span)->adv * extract_matrix_expansion(span->trm); return sqrt(dx*dx + dy*dy) + adv; } @@ -66,15 +82,30 @@ static double spans_adv( double delta_x = b->x - a->x; double delta_y = b->y - a->y; double s = sqrt( delta_x*delta_x + delta_y*delta_y); - double a_size = a->adv * matrix_expansion(a_span->trm); + double a_size = a->adv * extract_matrix_expansion(a_span->trm); s -= a_size; return s; } static double span_angle(span_t* span) { - /* Assume ctm is a rotation matix. */ double ret = atan2(-span->ctm.c, span->ctm.a); + if (0) + { + /* This is an attempt to take into account the trm matrix when looking + at spans, because for agstat.pdf vertical text seems to be achieved + by making trm rotate by 90 degrees. But it messes up the ordering of + rotated paragraphs in Python2.pdf so is disabled for now. */ + matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm); + point_t dir; + double ret; + dir.x = span->flags.wmode ? 0 : 1; + dir.y = span->flags.wmode ? 1 : 0; + dir = extract_multiply_matrix_point(m, dir); + ret = atan2(dir.y, dir.x); + return ret; + } + /* Assume ctm is a rotation matix. */ outfx("ctm.a=%f ctm.b=%f ret=%f", span->ctm.a, span->ctm.b, ret); return ret; /* Not sure whether this is right. Inclined text seems to be done by @@ -89,6 +120,22 @@ static double span_angle(span_t* span) }*/ } +static double span_angle2(span_t* span) +{ + if (span->chars_num > 1) + { + double dx = span->chars[span->chars_num-1].x - span->chars[0].x; + double dy = span->chars[span->chars_num-1].y - span->chars[0].y; + double ret1 = span_angle(span); + double ret2 = atan2(-dy, dx); + if (fabs(ret2 - ret1) > 0.01) + { + outf("### ret1=%f ret2=%f: %s", ret1, ret2, extract_span_string(NULL, span)); + } + } + return span_angle(span); +} + /* Returns static string containing brief info about span_t. 
*/ static const char* span_string2(extract_alloc_t* alloc, span_t* span) { @@ -182,36 +229,36 @@ static int lines_are_compatible( { if (a == b) return 0; if (!a->spans || !b->spans) return 0; - if (line_span_first(a)->flags.wmode != line_span_first(b)->flags.wmode) { + if (s_line_span_first(a)->flags.wmode != s_line_span_first(b)->flags.wmode) { return 0; } - if (matrix_cmp4( - &line_span_first(a)->ctm, - &line_span_first(b)->ctm + if (extract_matrix_cmp4( + &s_line_span_first(a)->ctm, + &s_line_span_first(b)->ctm )) { if (verbose) { outf("ctm's differ:"); outf(" %f %f %f %f %f %f", - line_span_first(a)->ctm.a, - line_span_first(a)->ctm.b, - line_span_first(a)->ctm.c, - line_span_first(a)->ctm.d, - line_span_first(a)->ctm.e, - line_span_first(a)->ctm.f + s_line_span_first(a)->ctm.a, + s_line_span_first(a)->ctm.b, + s_line_span_first(a)->ctm.c, + s_line_span_first(a)->ctm.d, + s_line_span_first(a)->ctm.e, + s_line_span_first(a)->ctm.f ); outf(" %f %f %f %f %f %f", - line_span_first(b)->ctm.a, - line_span_first(b)->ctm.b, - line_span_first(b)->ctm.c, - line_span_first(b)->ctm.d, - line_span_first(b)->ctm.e, - line_span_first(b)->ctm.f + s_line_span_first(b)->ctm.a, + s_line_span_first(b)->ctm.b, + s_line_span_first(b)->ctm.c, + s_line_span_first(b)->ctm.d, + s_line_span_first(b)->ctm.e, + s_line_span_first(b)->ctm.f ); } return 0; } { - double angle_b = span_angle(line_span_first(b)); + double angle_b = span_angle(s_line_span_first(b)); if (angle_b != angle_a) { outfx("%s:%i: angles differ"); return 0; @@ -221,6 +268,80 @@ static int lines_are_compatible( } +static const unsigned ucs_NONE = ((unsigned) -1); + +static int s_span_inside_rects( + extract_alloc_t* alloc, + span_t* span, + rect_t* rects, + int rects_num, + span_t* o_span + ) +/* Returns with <o_span> containing char_t's from <span> that are inside +rects[], and *span modified to remove any char_t's that we have moved to +<o_span>. 
+ +May return with span->chars_num == 0, in which case the caller must remove the +span (including freeing .font_name), because lots of code assumes that there +are no empty spans. */ +{ + int c; + *o_span = *span; + extract_strdup(alloc, span->font_name, &o_span->font_name); + o_span->chars = NULL; + o_span->chars_num = 0; + for (c=0; c<span->chars_num; ++c) + { + /* For now we just look at whether span's (x, y) is within any + rects[]. We could instead try to find character's bounding box etc. */ + char_t* char_ = &span->chars[c]; + int r; + for (r=0; r<rects_num; ++r) + { + rect_t* rect = &rects[r]; + if (1 + && char_->x >= rect->min.x + && char_->x < rect->max.x + && char_->y >= rect->min.y + && char_->y < rect->max.y + ) + { + if (extract_span_append_c(alloc, o_span, char_->ucs)) return -1; + /* Coverity warns, but o_span must have at least one item. */ + /* coverity[var_deref_op] */ + *extract_span_char_last(o_span) = *char_; + char_->ucs = ucs_NONE; /* Mark for removal below, so it is not used again. */ + break; + } + } + } + + /* Remove any char_t's that we've used. */ + { + int cc = 0; + for (c=0; c<span->chars_num; ++c) + { + char_t* char_ = &span->chars[c]; + if (char_->ucs != ucs_NONE) + { + span->chars[cc] = span->chars[c]; + cc += 1; + } + } + /* This might set span->chars_num to zero; our caller needs to remove + the span - lots of code assumes that all spans contain at least one + character. */ + span->chars_num = cc; + } + + if (o_span->chars_num) + { + //outf0(" span: %s", extract_span_string(alloc, span)); + outf("o_span: %s", extract_span_string(alloc, o_span)); + } + return 0; +} + /* Creates representation of span_t's that consists of a list of line_t's, with each line_t contains pointers to a list of span_t's. @@ -230,11 +351,16 @@ On entry: Original value of *o_lines and *o_lines_num are ignored. <spans> points to array of <spans_num> span_t*'s, each pointing to - an span_t. + a span_t. 
On exit: If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num - line_t*'s, each pointing to an line_t. + line_t*'s, each pointing to a line_t. + + If <rects_num> is zero, each of these line_t's will contain pointers to + items in <spans>; otherwise each of the line_t's will contain new spans + which should be freed by the caller (spans are not necessarily wholy inside + or outside rects[] so we need to create new spams). Otherwise we return -1 with errno set. *o_lines and *o_lines_num are undefined. @@ -242,35 +368,85 @@ On exit: static int make_lines( extract_alloc_t* alloc, span_t** spans, - int spans_num, + int* spans_num, + rect_t* rects, + int rects_num, line_t*** o_lines, int* o_lines_num ) { int ret = -1; - /* Make an line_t for each span. Then we will join some of these - line_t's together before returning. */ - int lines_num = spans_num; + /* Make a line_t for each span. Then we will join some of these line_t's + together before returning. */ + int lines_num = 0; line_t** lines = NULL; int a; int num_compatible; int num_joins; - if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end; - - /* Ensure we can clean up after error. */ - for (a=0; a<lines_num; ++a) { - lines[a] = NULL; - } - for (a=0; a<lines_num; ++a) { - if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end; - lines[a]->spans_num = 0; - if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end; - lines[a]->spans_num = 1; - lines[a]->spans[0] = spans[a]; - outfx("initial line a=%i: %s", a, line_string(lines[a])); + span_t* span = NULL; + + if (rects_num) + { + /* Make <lines> contain new span_t's and char_t's that are inside rects[]. 
*/ + for (a=0; a<*spans_num; ++a) + { + if (spans[a]->chars_num == 0) continue; /* In case used for table, */ + if (extract_realloc(alloc, &span, sizeof(*span))) goto end; + extract_span_init(span); + if (s_span_inside_rects(alloc, spans[a], rects, rects_num, span)) + { + goto end; + } + if (span->chars_num) + { + if (extract_realloc(alloc, &lines, sizeof(*lines) * (lines_num + 1))) goto end; + if (extract_malloc(alloc, &lines[lines_num], sizeof(line_t))) goto end; + lines_num += 1; + if (extract_malloc(alloc, &lines[lines_num-1]->spans, sizeof(span_t*) * 1)) goto end; + lines[lines_num-1]->spans[0] = span; + lines[lines_num-1]->spans_num = 1; + span = NULL; + } + else + { + extract_span_free(alloc, &span); + } + + if (!spans[a]->chars_num) + { + /* All characters in this span are inside table, so remove + entire span, otherwise the same characters will end up being + output outside the table also. */ + extract_span_free(alloc, &spans[a]); + memmove(&spans[a], &spans[a+1], sizeof(*spans) * ((*spans_num) - (a+1))); + *spans_num -= 1; + a -= 1; + } + } } + else + { + /* Make <lines> be a copy of <spans>. */ + lines_num = *spans_num; + if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end; + /* Ensure we can clean up after error. */ + for (a=0; a<lines_num; ++a) { + lines[a] = NULL; + } + for (a=0; a<lines_num; ++a) { + if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end; + lines[a]->spans_num = 0; + if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end; + lines[a]->spans_num = 1; + lines[a]->spans[0] = spans[a]; + /* Ensure that spans[] can be safely freed now we've moved it into lines[]. */ + spans[a] = NULL; + outfx("initial line a=%i: %s", a, line_string(lines[a])); + } + } + num_compatible = 0; /* For each line, look for nearest aligned line, and append if found. 
*/ @@ -290,14 +466,14 @@ static int make_lines( } if (0 && a < 1) verbose = 1; - outfx("looking at line_a=%s", line_string2(line_a)); + outfx("looking at line_a=%s", line_string2(alloc, line_a)); - span_a = line_span_last(line_a); + span_a = extract_line_span_last(line_a); angle_a = span_angle(span_a); if (verbose) outf("a=%i angle_a=%f ctm=%s: %s", a, angle_a * 180/pi, - matrix_string(&span_a->ctm), + extract_matrix_string(&span_a->ctm), line_string2(alloc, line_a) ); @@ -310,7 +486,6 @@ static int make_lines( continue; } if (verbose) { - outf(""); outf("a=%i b=%i: nearest_line_b=%i nearest_adv=%f", a, b, @@ -330,17 +505,17 @@ static int make_lines( /* Find angle between last glyph of span_a and first glyph of span_b. This detects whether the lines are lined up with each other (as opposed to being at the same angle but in different lines). */ - span_t* span_b = line_span_first(line_b); - double dx = span_char_first(span_b)->x - span_char_last(span_a)->x; - double dy = span_char_first(span_b)->y - span_char_last(span_a)->y; + span_t* span_b = s_line_span_first(line_b); + double dx = span_char_first(span_b)->x - extract_span_char_last(span_a)->x; + double dy = span_char_first(span_b)->y - extract_span_char_last(span_a)->y; double angle_a_b = atan2(-dy, dx); const double angle_tolerance_deg = 1; if (verbose) { outf("delta=(%f %f) alast=(%f %f) bfirst=(%f %f): angle_a=%f angle_a_b=%f", dx, dy, - span_char_last(span_a)->x, - span_char_last(span_a)->y, + extract_span_char_last(span_a)->x, + extract_span_char_last(span_a)->y, span_char_first(span_b)->x, span_char_first(span_b)->y, angle_a * 180 / pi, @@ -353,7 +528,7 @@ static int make_lines( /* Find distance between end of line_a and beginning of line_b. */ double adv = spans_adv( span_a, - span_char_last(span_a), + extract_span_char_last(span_a), span_char_first(span_b) ); if (verbose) outf("nearest_adv=%f. 
angle_a_b=%f adv=%f", @@ -370,8 +545,8 @@ static int make_lines( else { if (verbose) outf( "angle beyond tolerance: span_a last=(%f,%f) span_b first=(%f,%f) angle_a_b=%g angle_a=%g span_a.trm{a=%f b=%f}", - span_char_last(span_a)->x, - span_char_last(span_a)->y, + extract_span_char_last(span_a)->x, + extract_span_char_last(span_a)->y, span_char_first(span_b)->x, span_char_first(span_b)->y, angle_a_b * 180 / pi, @@ -386,24 +561,30 @@ static int make_lines( if (nearest_line) { /* line_a and nearest_line are aligned so we can move line_b's spans on to the end of line_a. */ - span_t* span_b = line_span_first(nearest_line); + double average_adv; + span_t* span_b = s_line_span_first(nearest_line); b = nearest_line_b; if (verbose) outf("found nearest line. a=%i b=%i", a, b); + /* Find average advance of the two adjacent spans in the two + lines we are considering joining, so that we can decide whether + the distance between them is large enough to merit joining with + a space character). */ + average_adv = ( + (span_adv_total(span_a) + span_adv_total(span_b)) + / + (double) (span_a->chars_num + span_b->chars_num) + ); + + if (0 && nearest_adv > 5 * average_adv) + { + continue; + } + if (1 - && span_char_last(span_a)->ucs != ' ' + && extract_span_char_last(span_a)->ucs != ' ' && span_char_first(span_b)->ucs != ' ' ) { - /* Find average advance of the two adjacent spans in the two - lines we are considering joining, so that we can decide whether - the distance between them is large enough to merit joining with - a space character). */ - double average_adv = ( - (span_adv_total(span_a) + span_adv_total(span_b)) - / - (double) (span_a->chars_num + span_b->chars_num) - ); - int insert_space = (nearest_adv > 0.25 * average_adv); if (insert_space) { /* Append space to span_a before concatenation. 
*/ @@ -413,8 +594,8 @@ static int make_lines( nearest_adv, average_adv ); - outf(" a: %s", span_string(alloc, span_a)); - outf(" b: %s", span_string(alloc, span_b)); + outf(" a: %s", extract_span_string(alloc, span_a)); + outf(" b: %s", extract_span_string(alloc, span_b)); } if (extract_realloc2( alloc, @@ -427,6 +608,13 @@ static int make_lines( extract_bzero(item, sizeof(*item)); item->ucs = ' '; item->adv = nearest_adv; + /* This is a hack to give our extra space a vaguely useful + (x,y) coordinate - this can be used later on when ordering + paragraphs. We could try to be more accurate by adding + item[-1]'s .adv suitably transformed by .wmode, .ctm and + .trm. */ + item->x = item[-1].x; + item->y = item[-1].y; } if (verbose) { @@ -440,14 +628,14 @@ static int make_lines( "joining line insert_space=%i a=%i (y=%f) to line b=%i (y=%f). nearest_adv=%f average_adv=%f", insert_space, a, - span_char_last(span_a)->y, + extract_span_char_last(span_a)->y, b, span_char_first(span_b)->y, nearest_adv, average_adv ); - outf("a: %s", span_string(alloc, span_a)); - outf("b: %s", span_string(alloc, span_b)); + outf("a: %s", extract_span_string(alloc, span_a)); + outf("b: %s", extract_span_string(alloc, span_b)); } } @@ -487,7 +675,7 @@ static int make_lines( the new extended line_a needs checking again. */ a -= 1; } - outfx("new line is:\n %s", line_string2(line_a)); + outfx("num_joins=%i new line is:\n %s", num_joins, line_string2(line_a)); } } @@ -524,7 +712,7 @@ static int make_lines( ret = 0; outf("Turned %i spans into %i lines. num_compatible=%i", - spans_num, + *spans_num, lines_num, num_compatible ); @@ -532,9 +720,18 @@ static int make_lines( end: if (ret) { /* Free everything. 
*/ + extract_span_free(alloc, &span); if (lines) { for (a=0; a<lines_num; ++a) { - if (lines[a]) extract_free(alloc, &lines[a]->spans); + if (lines[a]) + { + int s; + for (s=0; s<lines[a]->spans_num; ++s) + { + extract_span_free(alloc, &lines[a]->spans[s]); + } + extract_free(alloc, &lines[a]->spans); + } extract_free(alloc, &lines[a]); } } @@ -552,7 +749,7 @@ static double line_font_size_max(line_t* line) for (i=0; i<line->spans_num; ++i) { span_t* span = line->spans[i]; /* fixme: <size> should be double, which changes some output. */ - double size = matrix_expansion(span->trm); + double size = extract_matrix_expansion(span->trm); if (size > size_max) { size_max = size; } @@ -581,21 +778,35 @@ respectively. AQB is a right angle. We need to find AQ. */ -static double line_distance( - double ax, - double ay, - double bx, - double by, - double angle - ) +static double line_distance_y( double ax, double ay, double bx, double by, double angle) { double dx = bx - ax; double dy = by - ay; - return dx * sin(angle) + dy * cos(angle); } +/* Returns distance QB in above diagram. */ +static double line_distance_x( double ax, double ay, double bx, double by, double angle) +{ + double dx = bx - ax; + double dy = by - ay; + + return dx * cos(angle) - dy * sin(angle); +} + +static double line_distance_xp(point_t a, point_t b, double angle) +{ + return line_distance_x(a.x, a.y, b.x, b.y, angle); +} + +static int lines_overlap(point_t a_left, point_t a_right, point_t b_left, point_t b_right, double angle) +{ + if (line_distance_xp(a_left, b_right, angle) < 0) return 0; + if (line_distance_xp(a_right, b_left, angle) >= 0) return 0; + return 1; +} + /* A comparison function for use with qsort(), for sorting paragraphs within a page. 
*/ @@ -606,14 +817,49 @@ static int paragraphs_cmp(const void* a, const void* b) line_t* a_line = paragraph_line_first(*a_paragraph); line_t* b_line = paragraph_line_first(*b_paragraph); - span_t* a_span = line_span_first(a_line); - span_t* b_span = line_span_first(b_line); + span_t* a_span = s_line_span_first(a_line); + span_t* b_span = s_line_span_first(b_line); - /* If ctm matrices differ, always return this diff first. Note that we - ignore .e and .f because if data is from ghostscript then .e and .f vary - for each span, and we don't care about these differences. */ - int d = matrix_cmp4(&a_span->ctm, &b_span->ctm); - if (d) return d; + if (0) + { + double a_angle = span_angle2(a_span); + double b_angle = span_angle2(b_span); + if (fabs(a_angle - b_angle) > 0.01) + { + outf0("angles differ: a_angle=%f b_angle=%f", a_angle, b_angle); + outf0("a_span: %s", extract_span_string(NULL, a_span)); + outf0("b_span: %s", extract_span_string(NULL, b_span)); + if (a_angle - b_angle > 3.14/2) { + /* Give up if more than 90 deg. */ + return 0; + } + if (a_angle > b_angle) return 1; + if (a_angle < b_angle) return -1; + return 0; + } + } + if (1) + { + /* If ctm matrices differ, always return this diff first. Note that we + ignore .e and .f because if data is from ghostscript then .e and .f + vary for each span, and we don't care about these differences. 
*/ + int d = extract_matrix_cmp4(&a_span->ctm, &b_span->ctm); + if (d) + { + outf("extract_matrix_cmp4() returned non-zero."); + outf("a_span->ctm=%s trm=%s: %s", + extract_matrix_string(&a_span->ctm), + extract_matrix_string(&a_span->trm), + extract_span_string(NULL, a_span) + ); + outf("b_span->ctm=%s trm=%s: %s", + extract_matrix_string(&b_span->ctm), + extract_matrix_string(&a_span->trm), + extract_span_string(NULL, b_span) + ); + return d; + } + } { double a_angle = line_angle(a_line); @@ -628,7 +874,7 @@ static int paragraphs_cmp(const void* a, const void* b) double ay = line_item_first(a_line)->y; double bx = line_item_first(b_line)->x; double by = line_item_first(b_line)->y; - double distance = line_distance(ax, ay, bx, by, angle); + double distance = line_distance_y(ax, ay, bx, by, angle); if (distance > 0) return -1; if (distance < 0) return +1; } @@ -669,7 +915,7 @@ static int make_paragraphs( int num_joins; paragraph_t** paragraphs = NULL; - /* Start off with an paragraph_t for each line_t. */ + /* Start off with a paragraph_t for each line_t. */ int paragraphs_num = lines_num; if (extract_malloc(alloc, ¶graphs, sizeof(*paragraphs) * paragraphs_num)) goto end; /* Ensure we can clean up after error when setting up. */ @@ -685,11 +931,12 @@ static int make_paragraphs( paragraphs[a]->lines[0] = lines[a]; } + /* Now join paragraphs together where possible. 
*/ num_joins = 0; for (a=0; a<paragraphs_num; ++a) { - paragraph_t* nearest_paragraph; - int nearest_paragraph_b; - double nearest_paragraph_distance; + paragraph_t* nearest_paragraph = NULL; + int nearest_paragraph_b = -1; + double nearest_paragraph_distance = -1; line_t* line_a; double angle_a; int verbose; @@ -702,14 +949,9 @@ static int make_paragraphs( continue; } - nearest_paragraph = NULL; - nearest_paragraph_b = -1; - nearest_paragraph_distance = -1; assert(paragraph_a->lines_num > 0); - line_a = paragraph_line_last(paragraph_a); angle_a = line_angle(line_a); - verbose = 0; /* Look for nearest paragraph_t that could be appended to @@ -732,7 +974,7 @@ static int make_paragraphs( double ay = line_item_last(line_a)->y; double bx = line_item_first(line_b)->x; double by = line_item_first(line_b)->y; - double distance = line_distance(ax, ay, bx, by, angle_a); + double distance = line_distance_y(ax, ay, bx, by, angle_a); if (verbose) { outf( "angle_a=%f a=(%f %f) b=(%f %f) delta=(%f %f) distance=%f:", @@ -746,17 +988,39 @@ static int make_paragraphs( outf(" line_a=%s", line_string2(alloc, line_a)); outf(" line_b=%s", line_string2(alloc, line_b)); } - if (distance > 0) { + if (distance > 0) + { if (nearest_paragraph_distance == -1 - || distance < nearest_paragraph_distance) { - if (verbose) { - outf("updating nearest. distance=%f:", distance); - outf(" line_a=%s", line_string2(alloc, line_a)); - outf(" line_b=%s", line_string2(alloc, line_b)); + || distance < nearest_paragraph_distance) + { + int ok = 1; + if (0) + { + /* Check whether lines overlap horizontally. 
*/ + point_t a_left = char_to_point(line_item_first(line_a)); + point_t b_left = char_to_point(line_item_first(line_b)); + point_t a_right = char_to_point(line_item_last(line_a)); + point_t b_right = char_to_point(line_item_last(line_b)); + + if (!lines_overlap(a_left, a_right, b_left, b_right, angle_a)) + { + outf("Not joining lines because not overlapping."); + ok = 0; + } + } + + if (ok) + { + if (verbose) { + outf("updating nearest. distance=%f:", distance); + outf(" line_a=%s", line_string2(alloc, line_a)); + outf(" line_b=%s", line_string2(alloc, line_b)); + } + + nearest_paragraph_distance = distance; + nearest_paragraph_b = b; + nearest_paragraph = paragraph_b; } - nearest_paragraph_distance = distance; - nearest_paragraph_b = b; - nearest_paragraph = paragraph_b; } } } @@ -787,24 +1051,34 @@ static int make_paragraphs( outf(" %s", paragraph_string(alloc, paragraph_a)); outf(" %s", paragraph_string(alloc, nearest_paragraph)); outf("paragraph_a ctm=%s", - matrix_string(¶graph_a->lines[0]->spans[0]->ctm) + extract_matrix_string(¶graph_a->lines[0]->spans[0]->ctm) ); outf("paragraph_a trm=%s", - matrix_string(¶graph_a->lines[0]->spans[0]->trm) + extract_matrix_string(¶graph_a->lines[0]->spans[0]->trm) ); } /* Join these two paragraph_t's. */ - a_span = line_span_last(line_a); - if (span_char_last(a_span)->ucs == '-') { + a_span = extract_line_span_last(line_a); + if (extract_span_char_last(a_span)->ucs == '-' + || extract_span_char_last(a_span)->ucs == 0x2212 /* unicode dash */ + ) + { /* remove trailing '-' at end of prev line. char_t doesn't contain any malloc-heap pointers so this doesn't leak. */ a_span->chars_num -= 1; } - else { + else if (extract_span_char_last(a_span)->ucs == ' ') + { + } + else if (extract_span_char_last(a_span)->ucs == '/') + { + } + else + { /* Insert space before joining adjacent lines. 
*/ char_t* c_prev; char_t* c; - if (span_append_c(alloc, line_span_last(line_a), ' ')) goto end; + if (extract_span_append_c(alloc, extract_line_span_last(line_a), ' ')) goto end; c_prev = &a_span->chars[ a_span->chars_num-2]; c = &a_span->chars[ a_span->chars_num-1]; c->x = c_prev->x + c_prev->adv * a_span->ctm.a; @@ -834,9 +1108,10 @@ static int make_paragraphs( num_joins += 1; outfx( - "have joined paragraph a=%i to snearest_paragraph_b=%i", + "have joined paragraph a=%i to nearest_paragraph_b=%i. num_joins=%i.", a, - nearest_paragraph_b + nearest_paragraph_b, + num_joins ); if (nearest_paragraph_b > a) { @@ -884,26 +1159,21 @@ static int make_paragraphs( /* Sort paragraphs so they appear in correct order, using paragraphs_cmp(). */ - qsort( - paragraphs, - paragraphs_num, - sizeof(paragraph_t*), paragraphs_cmp - ); + qsort(paragraphs, paragraphs_num, sizeof(paragraph_t*), paragraphs_cmp); *o_paragraphs = paragraphs; *o_paragraphs_num = paragraphs_num; ret = 0; - outf("Turned %i lines into %i paragraphs", - lines_num, - paragraphs_num - ); - + outf("Turned %i lines into %i paragraphs", lines_num, paragraphs_num); end: - if (ret) { - if (paragraphs) { - for (a=0; a<paragraphs_num; ++a) { + if (ret) + { + if (paragraphs) + { + for (a=0; a<paragraphs_num; ++a) + { if (paragraphs[a]) extract_free(alloc, ¶graphs[a]->lines); extract_free(alloc, ¶graphs[a]); } @@ -913,39 +1183,688 @@ static int make_paragraphs( return ret; } -int extract_document_join(extract_alloc_t* alloc, document_t* document) +static int s_join_page_rects( + extract_alloc_t* alloc, + extract_page_t* page, + rect_t* rects, + int rects_num, + line_t*** lines, + int* lines_num, + paragraph_t*** paragraphs, + int* paragraphs_num + ) +/* Extracts text that is inside any of rects[0..rects_num], or all text if +rects_num is zero. 
*/ { - int ret = -1; + if (make_lines( + alloc, + page->spans, + &page->spans_num, + rects, + rects_num, + lines, + lines_num + )) return -1; + if (make_paragraphs( + alloc, + *lines, + *lines_num, + paragraphs, + paragraphs_num + )) return -1; + + return 0; +} + + +static int tablelines_compare_x(const void* a, const void* b) +/* Compares two tableline_t's rectangles using x as primary key. */ +{ + const tableline_t* aa = a; + const tableline_t* bb = b; + if (aa->rect.min.x > bb->rect.min.x) return +1; + if (aa->rect.min.x < bb->rect.min.x) return -1; + if (aa->rect.min.y > bb->rect.min.y) return +1; + if (aa->rect.min.y < bb->rect.min.y) return -1; + return 0; +} - /* For each page in <document> we join spans into lines and paragraphs. A - line is a list of spans that are at the same angle and on the same line. A - paragraph is a list of lines that are at the same angle and close together. +static int tablelines_compare_y(const void* a, const void* b) +/* Compares two tableline_t's rectangles using y as primary key. */ +{ + const tableline_t* aa = a; + const tableline_t* bb = b; + if (aa->rect.min.y > bb->rect.min.y) return +1; + if (aa->rect.min.y < bb->rect.min.y) return -1; + if (aa->rect.min.x > bb->rect.min.x) return +1; + if (aa->rect.min.x < bb->rect.min.x) return -1; + return 0; +} + +static int table_find_y_range(extract_alloc_t* alloc, tablelines_t* all, double y_min, double y_max, + tablelines_t* out) +/* Makes <out> to contain all lines in <all> with y coordinate in the range +y_min..y_max. 
*/ +{ + int i; + for (i=0; i<all->tablelines_num; ++i) + { + if (all->tablelines[i].rect.min.y >= y_min && all->tablelines[i].rect.min.y < y_max) + { + if (extract_realloc(alloc, &out->tablelines, sizeof(*out->tablelines) * (out->tablelines_num + 1))) return -1; + out->tablelines[out->tablelines_num] = all->tablelines[i]; + out->tablelines_num += 1; + } + else + { + outf("Excluding line because outside y=%f..%f: %s", y_min, y_max, extract_rect_string(&all->tablelines[i].rect)); + } + } + return 0; +} + + +static int overlap(double a_min, double a_max, double b_min, double b_max) +/* Returns one if a_min..a_max significantly overlapps b_min..b_max, otherwise +zero. */ +{ + double overlap; + int ret0; + int ret1; + assert(a_min < a_max); + assert(b_min < b_max); + if (b_min < a_min) b_min = a_min; + if (b_max > a_max) b_max = a_max; + if (b_max < b_min) b_max = b_min; + overlap = (b_max - b_min) / (a_max - a_min); + ret0 = overlap > 0.2; + ret1 = overlap > 0.8; + if (ret0 != ret1) + { + if (0) outf0("warning, unclear overlap=%f: a=%f..%f b=%f..%f", overlap, a_min, a_max, b_min, b_max); + } + return overlap > 0.8; +} + +void extract_cell_init(cell_t* cell) +{ + cell->rect.min.x = 0; + cell->rect.min.y = 0; + cell->rect.max.x = 0; + cell->rect.max.y = 0; + cell->above = 0; + cell->left = 0; + cell->extend_right = 0; + cell->extend_down = 0; + cell->lines = NULL; + cell->lines_num = 0; + cell->paragraphs = NULL; + cell->paragraphs_num = 0; +} + + +static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y) +{ + /* Find cell extensions to right and down by looking at cells' .left and + .above flags. + + For example for adjacent cells ABC..., we extend A to include cells BC.. + until we reach a cell with .left set to one. + + ABCDE + FGHIJ + KLMNO + + When looking to extend cell A, we only look at cells in the same column or + same row, (i.e. in the above example we look at BCDE and FK, and not at + GHIJ and LMNO). 
+ + For example if BCDE have no left lines and FK have no above lines, we + ignore any lines in GHIJ and LMNO and make A extend to the entire 3x4 + box. Having found this box, we set .above=0 and .left to 0 in all enclosed + cells, which simplifies html table generation code. */ - int p; - for (p=0; p<document->pages_num; ++p) { - extract_page_t* page = document->pages[p]; - outf("processing page %i: num_spans=%i", p, page->spans_num); + int y; + for (y=0; y<cells_num_y; ++y) + { + int x; + for (x=0; x<cells_num_x; ++x) + { + cell_t* cell = cells[y * cells_num_x + x]; + outf("xy=(%i %i) above=%i left=%i", x, y, cell->above, cell->left); + if (cell->left && cell->above) + { + /* See how far this cell extends to right and down. */ + int xx; + int yy; + for (xx=x+1; xx<cells_num_x; ++xx) + { + if (cells[y * cells_num_x + xx]->left) break; + } + cell->extend_right = xx - x; + cell->rect.max.x = cells[y * cells_num_x + xx-1]->rect.max.x; + for (yy=y+1; yy<cells_num_y; ++yy) + { + if (cells[yy * cells_num_x + x]->above) break; + } + cell->extend_down = yy - y; + cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y; + + /* Clear .above and .left in enclosed cells. */ + for (xx = x; xx < x + cell->extend_right; ++xx) + { + int yy; + for (yy = y; yy < y + cell->extend_down; ++yy) + { + cell_t* cell2 = cells[cells_num_x * yy + xx]; + if ( xx==x && yy==y) + {} + else + { + if (xx==x) + { + cell2->extend_right = cell->extend_right; + } + cell2->above = 0; + /* We set .left to 1 for left-most cells - e.g. F + and K in the above diagram; this allows us to + generate correct html without lots of recursing + looking for extend_down in earlier cells. 
*/ + cell2->left = (xx == x); + outf("xy=(%i %i) xxyy=(%i %i) have set cell2->above=%i left=%i", + x, y, xx, yy, cell2->above, cell2->left + ); + } + } + } + } + } + } + return 0; +} - if (make_lines( - alloc, - page->spans, - page->spans_num, - &page->lines, - &page->lines_num - )) goto end; - if (make_paragraphs( +static int table_find_cells_text(extract_alloc_t* alloc, extract_page_t* page, + cell_t** cells, int cells_num_x, int cells_num_y) +/* Sets each cell to contain the text that is within the cell's boundary. We +remove any found text from the page. */ +{ + /* Find text within each cell. We don't attempt to handle images within + cells. */ + int e = -1; + int i; + int cells_num = cells_num_x * cells_num_y; + for (i=0; i<cells_num; ++i) + { + cell_t* cell = cells[i]; + if (!cell->above || !cell->left) continue; + if (s_join_page_rects( alloc, - page->lines, - page->lines_num, - &page->paragraphs, - &page->paragraphs_num - )) goto end; + page, + &cell->rect, + 1 /*rects_num*/, + &cell->lines, + &cell->lines_num, + &cell->paragraphs, + &cell->paragraphs_num + )) return -1; } + + /* Append the table we have found to page->tables[]. */ + if (extract_realloc(alloc, &page->tables, sizeof(*page->tables) * (page->tables_num + 1))) goto end; + if (extract_malloc(alloc, &page->tables[page->tables_num], sizeof(*page->tables[page->tables_num]))) goto end; + page->tables[page->tables_num]->pos.x = cells[0]->rect.min.x; + page->tables[page->tables_num]->pos.y = cells[0]->rect.min.y; + page->tables[page->tables_num]->cells = cells; + page->tables[page->tables_num]->cells_num_x = cells_num_x; + page->tables[page->tables_num]->cells_num_y = cells_num_y; + page->tables_num += 1; + + if (0) + { + /* For debugging. */ + int y; + outf0("table:\n"); + for (y=0; y<cells_num_y; ++y) + { + int x; + for (x=0; x<cells_num_x; ++x) + { + cell_t* cell = cells[cells_num_x * y + x]; + fprintf(stderr, " %c%c x=%i y=% 3i 3i w=%i h=%i", + cell->left ? '|' : ' ', + cell->above ? 
'-' : ' ', + x, + y, + cell->extend_right, + cell->extend_down + ); + } + fprintf(stderr, "\n"); + } + + } + + e = 0; + end: + return e; +} - ret = 0; +static int table_find(extract_alloc_t* alloc, extract_page_t* page, double y_min, double y_max) +/* Finds single table made from lines whose y coordinates are in the range +y_min..y_max. */ +{ + tablelines_t* all_h = &page->tablelines_horizontal; + tablelines_t* all_v = &page->tablelines_vertical; + int e = -1; + int i; + + /* Find subset of vertical and horizontal lines that are within range + y_min..y_max, and sort by y coordinate. */ + tablelines_t tl_h = {NULL, 0}; + tablelines_t tl_v = {NULL, 0}; + cell_t** cells = NULL; + int cells_num = 0; + int cells_num_x = 0; + int cells_num_y = 0; + int x; + int y; + + outf("y=(%f %f)", y_min, y_max); + + if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end; + if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end; + /* Suppress false coverity warning - qsort() does not dereference null + pointer if nmemb is zero. */ + /* coverity[var_deref_model] */ + qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x); + + if (0) + { + /* Show raw lines info. */ + outf0("all_h->tablelines_num=%i tl_h.tablelines_num=%i", all_h->tablelines_num, tl_h.tablelines_num); + for (i=0; i<tl_h.tablelines_num; ++i) + { + outf0(" %i: %s", i, extract_rect_string(&tl_h.tablelines[i].rect)); + } + + outf0("all_v->tablelines_num=%i tl_v.tablelines_num=%i", all_v->tablelines_num, tl_v.tablelines_num); + for (i=0; i<tl_v.tablelines_num; ++i) + { + outf0(" %i: %s", i, extract_rect_string(&tl_v.tablelines[i].rect)); + } + } + /* Find the cells defined by the vertical and horizontal lines. + + It seems that lines can be disjoint, e.g. what looks like a single + horizontal line could be made up of multiple lines all with the same + y coordinate, so we use i_next and j_next to skip these sublines when + iterating. 
*/ + cells = NULL; + cells_num = 0; + cells_num_x = 0; + cells_num_y = 0; + for (i=0; i<tl_h.tablelines_num; ) + { + int i_next; + int j; + for (i_next=i+1; i_next<tl_h.tablelines_num; ++i_next) + { + if (tl_h.tablelines[i_next].rect.min.y - tl_h.tablelines[i].rect.min.y > 5) break; + } + if (i_next == tl_h.tablelines_num) + { + /* Ignore last row of points - cells need another row below. */ + break; + } + cells_num_y += 1; + + for (j=0; j<tl_v.tablelines_num; ) + { + int j_next; + int ii; + int jj; + cell_t* cell; + + for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next) + { + if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break; + } + outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect)); + + if (j_next == tl_v.tablelines_num) break; + + if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end; + if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end; + cell = cells[cells_num]; + cells_num += 1; + if (i==0) cells_num_x += 1; + + cell->rect.min.x = tl_v.tablelines[j].rect.min.x; + cell->rect.min.y = tl_h.tablelines[i].rect.min.y; + cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x; + cell->rect.max.y = (i_next < tl_h.tablelines_num) ? tl_h.tablelines[i_next].rect.min.y : cell->rect.min.y; + cell->above = (i==0); + cell->left = (j==0); + cell->extend_right = 1; + cell->extend_down = 1; + cell->lines = NULL; + cell->lines_num = 0; + cell->paragraphs = NULL; + cell->paragraphs_num = 0; + + /* Set cell->above if there is a horizontal line above the cell. 
*/ + outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect)); + for (ii = i; ii < i_next; ++ii) + { + tableline_t* h = &tl_h.tablelines[ii]; + if (overlap( + cell->rect.min.x, + cell->rect.max.x, + h->rect.min.x, + h->rect.max.x + )) + { + cell->above = 1; + break; + } + } + + /* Set cell->left if there is a vertical line to the left of the cell. */ + for (jj = j; jj < j_next; ++jj) + { + tableline_t* v = &tl_v.tablelines[jj]; + if (overlap( + cell->rect.min.y, + cell->rect.max.y, + v->rect.min.y, + v->rect.max.y + )) + { + cell->left = 1; + break; + } + } + + j = j_next; + } + + i = i_next; + } + + assert(cells_num == cells_num_x * cells_num_y); + + /* Remove cols and rows where no cells have .above and .left - these + will not appear. It also avoids spurious empty columns when table uses + closely-spaced double lines as separators. */ + for (x=0; x<cells_num_x; ++x) + { + int has_cells = 0; + for (y=0; y<cells_num_y; ++y) + { + cell_t* cell = cells[y * cells_num_x + x]; + if (cell->above && cell->left) + { + has_cells = 1; + break; + } + } + if (!has_cells) + { + /* Remove column <x>. */ + int j = 0; + outf("Removing column %i. 
cells_num=%i cells_num_x=%i cells_num_y=%i", x, cells_num, cells_num_x, cells_num_y); + for (i=0; i<cells_num; ++i) + { + if (i % cells_num_x == x) + { + extract_cell_free(alloc, &cells[i]); + continue; + } + cells[j] = cells[i]; + j += 1; + } + cells_num -= cells_num_y; + cells_num_x -= 1; + } + } + + if (cells_num == 0) + { + e = 0; + goto end; + } + + if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end; + + if (table_find_cells_text(alloc, page, cells, cells_num_x, cells_num_y)) goto end; + + e = 0; end: + extract_free(alloc, &tl_h.tablelines); + extract_free(alloc, &tl_v.tablelines); + if (e) + { + for (i=0; i<cells_num; ++i) + { + extract_cell_free(alloc, &cells[i]); + } + extract_free(alloc, &cells); + } + return e; +} - return ret; + +static int extract_page_tables_find_lines( + extract_alloc_t* alloc, + extract_page_t* page + ) +/* Finds tables in <page> by looking for lines in page->tablelines_horizontal +and page->tablelines_vertical that look like table dividers. + +Any text found inside tables is removed from page->spans[]. +*/ +{ + double miny; + double maxy; + double margin = 1; + int iv; + int ih; + outf("page->tablelines_horizontal.tablelines_num=%i", page->tablelines_horizontal.tablelines_num); + outf("page->tablelines_vertical.tablelines_num=%i", page->tablelines_vertical.tablelines_num); + + /* Sort all lines by y coordinate. */ + qsort( + page->tablelines_horizontal.tablelines, + page->tablelines_horizontal.tablelines_num, + sizeof(*page->tablelines_horizontal.tablelines), + tablelines_compare_y + ); + qsort( + page->tablelines_vertical.tablelines, + page->tablelines_vertical.tablelines_num, + sizeof(*page->tablelines_vertical.tablelines), + tablelines_compare_y + ); + + if (0) + { + /* Show info about lines. 
*/ + int i; + outf0("tablelines_horizontal:"); + for (i=0; i<page->tablelines_horizontal.tablelines_num; ++i) + { + outf0(" color=%f: %s", + page->tablelines_horizontal.tablelines[i].color, + extract_rect_string(&page->tablelines_horizontal.tablelines[i].rect) + ); + } + outf0("tablelines_vertical:"); + for (i=0; i<page->tablelines_vertical.tablelines_num; ++i) + { + outf0(" color=%f: %s", + page->tablelines_vertical.tablelines[i].color, + extract_rect_string(&page->tablelines_vertical.tablelines[i].rect) + ); + } + } + + /* Look for completely separate vertical regions that define different + tables, by looking for vertical gaps between the rects of each + horizontal/vertical line. */ + maxy = -DBL_MAX; + miny = -DBL_MAX; + iv = 0; + ih = 0; + for(;;) + { + tableline_t* tlv = NULL; + tableline_t* tlh = NULL; + tableline_t* tl; + if (iv < page->tablelines_vertical.tablelines_num) + { + tlv = &page->tablelines_vertical.tablelines[iv]; + } + /* We only consider horizontal lines that are not white. This is a bit + of a cheat to get the right behaviour with twotables_2.pdf. */ + while (ih < page->tablelines_horizontal.tablelines_num) + { + if (page->tablelines_horizontal.tablelines[ih].color == 1) + { + /* Ignore white horizontal lines. */ + ++ih; + } + else + { + tlh = &page->tablelines_horizontal.tablelines[ih]; + break; + } + } + if (tlv && tlh) + { + tl = (tlv->rect.min.y < tlh->rect.min.y) ? tlv : tlh; + } + else if (tlv) tl = tlv; + else if (tlh) tl = tlh; + else break; + if (tl == tlv) iv += 1; + else ih += 1; + if (tl->rect.min.y > maxy + margin) + { + if (maxy > miny) + { + outf("New table. maxy=%f miny=%f", maxy, miny); + /* Find table. */ + table_find(alloc, page, miny - margin, maxy + margin); + } + miny = tl->rect.min.y; + } + if (tl->rect.max.y > maxy) maxy = tl->rect.max.y; + } + + /* Find last table. 
*/ + table_find(alloc, page, miny - margin, maxy + margin); + + return 0; +} + + +static void show_tables(table_t** tables, int tables_num) +/* For debugging only. */ +{ + int i; + outf0("tables_num=%i", tables_num); + for (i=0; i<tables_num; ++i) + { + table_t* table = tables[i]; + int y; + outf0("table %i: cells_num_y=%i cells_num_x=%i", i, table->cells_num_y, table->cells_num_x); + for (y=0; y<table->cells_num_y; ++y) + { + int x; + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[table->cells_num_x * y + x]; + outf0("cell: y=% 3i x=% 3i: left=%i above=%i rect=%s", + y, x, cell->left, cell->above, extract_rect_string(&cell->rect)); + } + } + } +} + +static int extract_page_tables_find( + extract_alloc_t* alloc, + extract_page_t* page + ) +/* Find tables in <page>. + +At the moment this only calls extract_page_tables_find_lines(), but in future +will call other functions that find tables in different ways, e.g. by analysing +an image of a page, or looking for blocks of whitespace in between chunks of +text. */ +{ + if (extract_page_tables_find_lines(alloc, page)) return -1; + + if (0) + { + outf0("=== tables from extract_page_tables_find_lines():"); + show_tables(page->tables, page->tables_num); + } + + return 0; +} + +static int extract_document_join_page( + extract_alloc_t* alloc, + extract_page_t* page + ) +/* Finds tables and paragraphs on <page>. */ +{ + /* Find tables on this page first. This will remove text that is within + tables from page->spans, so that text doesn't appearing more than once in + the final output. */ + if (extract_page_tables_find(alloc, page)) return -1; + + /* Now join remaining spans into lines and paragraphs. */ + if (s_join_page_rects( + alloc, + page, + NULL /*rects*/, + 0 /*rects_num*/, + &page->lines, + &page->lines_num, + &page->paragraphs, + &page->paragraphs_num + )) + { + outf0("s_join_page_rects failed. 
page->spans_num=%i page->lines_num=%i page->paragraphs_num=%i", + page->spans_num, + page->lines_num, + page->paragraphs_num + ); + return -1; + } + + return 0; +} + + +int extract_document_join(extract_alloc_t* alloc, document_t* document) +{ + /* For each page in <document> we find tables and join spans into lines and paragraphs. + + A line is a list of spans that are at the same angle and on the same + line. A paragraph is a list of lines that are at the same angle and close + together. + */ + int p; + for (p=0; p<document->pages_num; ++p) { + extract_page_t* page = document->pages[p]; + + outf("processing page %i: num_spans=%i", p, page->spans_num); + if (extract_document_join_page(alloc, page)) return -1; + } + + return 0; } diff --git a/extract/src/mem.c b/extract/src/mem.c index 83b5032c..1c3c96e6 100644 --- a/extract/src/mem.c +++ b/extract/src/mem.c @@ -19,16 +19,26 @@ void extract_bzero(void *b, size_t len) int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va) { int n; - int n2; + int ret; va_list va2; va_copy(va2, va); n = vsnprintf(NULL, 0, format, va); - if (n < 0) return n; - if (extract_malloc(alloc, out, n + 1)) return -1; - n2 = vsnprintf(*out, n + 1, format, va2); + if (n < 0) + { + ret = n; + goto end; + } + if (extract_malloc(alloc, out, n + 1)) + { + ret = -1; + goto end; + } + vsnprintf(*out, n + 1, format, va2); + ret = 0; + + end: va_end(va2); - assert(n2 == n); - return n2; + return ret; } diff --git a/extract/src/mem.h b/extract/src/mem.h index ffdcb049..2611b04f 100644 --- a/extract/src/mem.h +++ b/extract/src/mem.h @@ -8,8 +8,17 @@ void extract_bzero(void *b, size_t len); -int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va); -int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...); +int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va) + #ifdef __GNUC__ + __attribute__ ((format (printf, 3, 0))) + #endif 
+ ; + +int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...) + #ifdef __GNUC__ + __attribute__ ((format (printf, 3, 4))) + #endif + ; int extract_strdup(extract_alloc_t* alloc, const char* s, char** o_out); diff --git a/extract/src/memento.py b/extract/src/memento.py index 987cd4fd..55171e39 100755 --- a/extract/src/memento.py +++ b/extract/src/memento.py @@ -3,20 +3,29 @@ ''' Post-processor for Memento. +Usage: + memento.py <args> [<command> ...] + Args: -q <quiet> Controls how often we output 'Memory squeezing @ ...' lines. E.g. '-q 10' outputs for multiples of 10. + +If <command> is specified we run it and look at the output. Otherwise we assume +that Memento output is available on our stdin. ''' import os import re +import subprocess import sys def main(): quiet = 1 + quiet_next = 0 out_raw = None + command = None args = iter(sys.argv[1:]) while 1: try: @@ -29,15 +38,32 @@ def main(): out_raw = open(next(args), 'w') elif arg == '-q': quiet = int(next(args)) - else: + elif arg.startswith('-'): raise Exception(f'unrecognised arg: {arg}') + else: + command = arg + for a in args: + command += f' {a}' + + if command: + print(f'Running: {command}') + child = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + text=True, + ) + stdin = child.stdout + else: + stdin = sys.stdin openbsd = os.uname()[0] == 'OpenBSD' n = None segv = 0 leaks = 0 lines = [] - for line in sys.stdin: + for line in stdin: if out_raw: out_raw.write(line) m = re.match('^Memory squeezing @ ([0-9]+)( complete)?', line) @@ -45,7 +71,7 @@ def main(): if not m.group(2): # Start of squeeze. - if not openbsd: + if 0 and not openbsd: # Looks like memento's forked processes might terminate # before they get to output the 'Memory squeezing @ <N> # complete' line. 
@@ -53,9 +79,10 @@ def main(): assert n is None, f'n={n} line={line!r}' n = int(m.group(1)) - if n % quiet == 0: - sys.stdout.write(line) + if n >= quiet_next: + sys.stdout.write(f'quiet_next={quiet_next!r} n={n!r}: {line}') sys.stdout.flush() + quiet_next = (n + quiet) // quiet * quiet else: # End of squeeze. assert n == int(m.group(1)) @@ -66,6 +93,8 @@ def main(): if l.endswith('\n'): l = l[:-1] print(f' {l}') + if command: + print(f'Examine with: MEMENTO_FAILAT={n} {command}') lines = [] segv = 0 leaks = 0 diff --git a/extract/src/misc-test.c b/extract/src/misc-test.c index 58b098ff..5e658e8f 100644 --- a/extract/src/misc-test.c +++ b/extract/src/misc-test.c @@ -35,6 +35,15 @@ static void s_check( if (!ok) s_num_fails += 1; } +static void s_check_e( int e, const char* text) +{ + if (e) + { + s_num_fails += 1; + printf( "Error: e=%i: %s\n", e, text); + } +} + static void s_check_int(const char* text, int value_expected, int expected_errno) { int value; @@ -59,6 +68,53 @@ static void s_check_uint(const char* text, unsigned expected_value, int expected return; } +static void s_check_xml_parse() +{ + int e; + extract_buffer_t* buffer; + extract_xml_tag_t tag; + unsigned i; + const char* texts[] = { + "<foo a=1>text</foo>", + "< >", + "<foo bar=>", + "< bar=>", + "< =>", + }; + + extract_xml_tag_init( &tag); + + for (i=0; i<sizeof(texts) / sizeof(texts[0]); ++i) + { + const char* text = texts[i]; + printf("testing extract_xml_pparse_*(): %s\n", text); + e = extract_buffer_open_simple( + NULL /*alloc*/, + text, + strlen(text), + NULL /*handle*/, + NULL /*fn_close*/, + &buffer + ); + s_check_e( e, "extract_buffer_open_simple()"); + e = extract_xml_pparse_init( NULL /*alloc*/, buffer, NULL /*first_line*/); + s_check_e( e, "extract_xml_pparse_init()"); + + e = extract_xml_pparse_next( buffer, &tag); + s_check_e( e, "extract_xml_pparse_next()"); + s_check_e( tag.name ? 
0 : 1, "tag.name is not null"); + + { + int j; + for (j=0; j<tag.attributes_num; ++j) + { + s_check_e( tag.attributes[j].name ? 0 : 1, "attribute is non-null"); + s_check_e( tag.attributes[j].value ? 0 : 1, "attribute is non-null"); + } + } + } +} + int main(void) { printf("testing extract_xml_str_to_int():\n"); @@ -73,6 +129,8 @@ int main(void) s_check_uint("-20b", 0, EINVAL); s_check_uint("123456789123", 0, ERANGE); + s_check_xml_parse(); + printf("s_num_fails=%i\n", s_num_fails); if (s_num_fails) { diff --git a/extract/src/odt.c b/extract/src/odt.c index bacb362d..9e369078 100644 --- a/extract/src/odt.c +++ b/extract/src/odt.c @@ -21,6 +21,7 @@ odt_paragraph_finish(). */ #include <assert.h> #include <errno.h> +#include <float.h> #include <math.h> #include <stdlib.h> #include <stdio.h> @@ -29,17 +30,16 @@ odt_paragraph_finish(). */ #include <sys/stat.h> -static int extract_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) +static int s_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "\n\n<text:p>"); } -static int extract_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) +static int s_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content) { return extract_astring_cat(alloc, content, "</text:p>"); } - /* ODT doesn't seem to support ad-hoc inline font specifications; instead we have to define a style at the start of the content.xml file. So when writing content we insert a style name and add the required styles to a @@ -48,10 +48,7 @@ extract_odt_styles_t struct. */ struct extract_odt_style_t { int id; /* A unique id for this style. 
*/ - char* font_name; - double font_size; - int font_bold; - int font_italic; + font_t font; }; struct extract_odt_styles_t @@ -61,41 +58,47 @@ struct extract_odt_styles_t int styles_num; }; -static int extract_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b) +static int s_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b) { int d; double dd; - if ((d = strcmp(a->font_name, b->font_name))) return d; - if ((dd = a->font_size - b->font_size) != 0.0) return (dd > 0.0) ? 1 : -1; - if ((d = a->font_bold - b->font_bold)) return d; - if ((d = a->font_italic - b->font_italic)) return d; + if ((d = strcmp(a->font.name, b->font.name))) return d; + if ((dd = a->font.size - b->font.size) != 0.0) return (dd > 0.0) ? 1 : -1; + if ((d = a->font.bold - b->font.bold)) return d; + if ((d = a->font.italic - b->font.italic)) return d; return 0; } -static int extract_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text) +static int s_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text) { - const char* font_name = style->font_name; + const char* font_name = style->font.name; /* This improves output e.g. for zlib.3.pdf, but clearly a hack. */ if (0 && strstr(font_name, "Helvetica")) { font_name = "Liberation Sans"; } - outf("style->font_name=%s font_name=%s", style->font_name, font_name); + outf("style->font_name=%s font_name=%s", style->font.name, font_name); if (extract_astring_catf(alloc, text, "<style:style style:name=\"T%i\" style:family=\"text\">", style->id)) return -1; if (extract_astring_catf(alloc, text, "<style:text-properties style:font-name=\"%s\"", font_name)) return -1; - if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font_size)) return -1; - if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font_bold ? 
"bold" : "normal")) return -1; - if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font_italic ? "italic" : "normal")) return -1; + if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font.size)) return -1; + if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font.bold ? "bold" : "normal")) return -1; + if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font.italic ? "italic" : "normal")) return -1; if (extract_astring_cat(alloc, text, " /></style:style>")) return -1; return 0; } void extract_odt_styles_free(extract_alloc_t* alloc, extract_odt_styles_t* styles) { + int i; + for (i=0; i<styles->styles_num; ++i) + { + extract_odt_style_t* style = &styles->styles[i]; + extract_free(alloc, &style->font.name); + } extract_free(alloc, &styles->styles); } -static int extract_odt_styles_definitions( +static int s_odt_styles_definitions( extract_alloc_t* alloc, extract_odt_styles_t* styles, extract_astring_t* out @@ -105,7 +108,7 @@ static int extract_odt_styles_definitions( if (extract_astring_cat(alloc, out, "<office:automatic-styles>")) return -1; for (i=0; i<styles->styles_num; ++i) { - if (extract_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1; + if (s_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1; } extract_astring_cat(alloc, out, "<style:style style:name=\"gr1\" style:family=\"graphic\">\n"); extract_astring_cat(alloc, out, "<style:graphic-properties" @@ -159,25 +162,22 @@ static int extract_odt_styles_definitions( return 0; } -static int styles_add( +static int s_odt_styles_add( extract_alloc_t* alloc, extract_odt_styles_t* styles, - const char* font_name, - double font_size, - int font_bold, - int font_italic, + font_t* font, extract_odt_style_t** o_style ) /* Adds specified style to <styles> if not already present. Sets *o_style to point to the style_t within <styles>. 
*/ { - extract_odt_style_t style = {0 /*id*/, (char*) font_name, font_size, font_bold, font_italic}; + extract_odt_style_t style = {0 /*id*/, *font}; int i; /* We keep styles->styles[] sorted; todo: use bsearch or similar when searching. */ for (i=0; i<styles->styles_num; ++i) { - int d = extract_odt_style_compare(&style, &styles->styles[i]); + int d = s_odt_style_compare(&style, &styles->styles[i]); if (d == 0) { *o_style = &styles->styles[i]; @@ -190,92 +190,79 @@ point to the style_t within <styles>. */ memmove(&styles->styles[i+1], &styles->styles[i], sizeof(styles->styles[0]) * (styles->styles_num - i)); styles->styles_num += 1; styles->styles[i].id = styles->styles_num + 10; /* Leave space for template's built-in styles. */ - if (extract_strdup(alloc, font_name, &styles->styles[i].font_name)) return -1; - styles->styles[i].font_size = font_size; - styles->styles[i].font_bold = font_bold; - styles->styles[i].font_italic = font_italic; + if (extract_strdup(alloc, font->name, &styles->styles[i].font.name)) return -1; + styles->styles[i].font.size = font->size; + styles->styles[i].font.bold = font->bold; + styles->styles[i].font.italic = font->italic; *o_style = &styles->styles[i]; return 0; } static int extract_odt_run_start( - extract_alloc_t* alloc, - extract_astring_t* content, - extract_odt_styles_t* styles, - const char* font_name, - double font_size, - int bold, - int italic + extract_alloc_t* alloc, + extract_astring_t* content, + extract_odt_styles_t* styles, + content_state_t* content_state ) -/* Starts a new run. Caller must ensure that extract_odt_run_finish() was +/* Starts a new run. Caller must ensure that s_odt_run_finish() was called to terminate any previous run. 
*/ { extract_odt_style_t* style; - if (styles_add(alloc, styles, font_name, font_size, bold, italic, &style)) return -1; + if (s_odt_styles_add( + alloc, + styles, + &content_state->font, + &style + )) return -1; if (extract_astring_catf(alloc, content, "<text:span text:style-name=\"T%i\">", style->id)) return -1; return 0; } -static int extract_odt_run_finish(extract_alloc_t* alloc, extract_astring_t* content) +static int s_odt_run_finish(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content) { + if (content_state) content_state->font.name = NULL; return extract_astring_cat(alloc, content, "</text:span>"); } -static int extract_odt_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles) +static int s_odt_append_empty_paragraph(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles) /* Append an empty paragraph to *content. */ { int e = -1; - if (extract_odt_paragraph_start(alloc, content)) goto end; + static char fontname[] = "OpenSans"; + content_state_t content_state = {0}; + if (s_odt_paragraph_start(alloc, content)) goto end; /* [This comment is from docx, haven't checked odt.] It seems like our - choice of font size here doesn't make any difference to the ammount of + choice of font size here doesn't make any difference to the amount of vertical space, unless we include a non-space character. Presumably something to do with the styles in the template document. */ - if (extract_odt_run_start( - alloc, - content, - styles, - "OpenSans", - 10 /*font_size*/, - 0 /*font_bold*/, - 0 /*font_italic*/ - )) goto end; + content_state.font.name = fontname; + content_state.font.size = 10; + content_state.font.bold = 0; + content_state.font.italic = 0; + if (extract_odt_run_start(alloc, content, styles, &content_state)) goto end; //docx_char_append_string(content, " "); /*   is non-break space. 
*/ - if (extract_odt_run_finish(alloc, content)) goto end; - if (extract_odt_paragraph_finish(alloc, content)) goto end; + if (s_odt_run_finish(alloc, NULL /*content_state*/, content)) goto end; + if (s_odt_paragraph_finish(alloc, content)) goto end; e = 0; end: return e; } -typedef struct -{ - const char* font_name; - double font_size; - int font_bold; - int font_italic; - matrix_t* ctm_prev; - /* todo: add extract_odt_styles_t member? */ -} content_state_t; -/* Used to keep track of font information when writing paragraphs of odt -content, e.g. so we know whether a font has changed so need to start a new odt -span. */ - - -static int extract_document_to_odt_content_paragraph( +static int s_document_to_odt_content_paragraph( extract_alloc_t* alloc, - content_state_t* state, + content_state_t* content_state, paragraph_t* paragraph, extract_astring_t* content, extract_odt_styles_t* styles ) -/* Append odt xml for <paragraph> to <content>. Updates *state if we change -font. */ +/* Append odt xml for <paragraph> to <content>. Updates *content_state if we +change font. */ { int e = -1; int l; - if (extract_odt_paragraph_start(alloc, content)) goto end; + if (s_odt_paragraph_start(alloc, content)) goto end; for (l=0; l<paragraph->lines_num; ++l) { @@ -286,50 +273,41 @@ font. 
*/ int si; span_t* span = line->spans[s]; double font_size_new; - state->ctm_prev = &span->ctm; + content_state->ctm_prev = &span->ctm; font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm); - if (!state->font_name - || strcmp(span->font_name, state->font_name) - || span->flags.font_bold != state->font_bold - || span->flags.font_italic != state->font_italic - || font_size_new != state->font_size + if (!content_state->font.name + || strcmp(span->font_name, content_state->font.name) + || span->flags.font_bold != content_state->font.bold + || span->flags.font_italic != content_state->font.italic + || font_size_new != content_state->font.size ) { - if (state->font_name) + if (content_state->font.name) { - if (extract_odt_run_finish(alloc, content)) goto end; + if (s_odt_run_finish(alloc, content_state, content)) goto end; } - state->font_name = span->font_name; - state->font_bold = span->flags.font_bold; - state->font_italic = span->flags.font_italic; - state->font_size = font_size_new; - if (extract_odt_run_start( - alloc, - content, - styles, - state->font_name, - state->font_size, - state->font_bold, - state->font_italic - )) goto end; + content_state->font.name = span->font_name; + content_state->font.bold = span->flags.font_bold; + content_state->font.italic = span->flags.font_italic; + content_state->font.size = font_size_new; + if (extract_odt_run_start( alloc, content, styles, content_state)) goto end; } for (si=0; si<span->chars_num; ++si) { char_t* char_ = &span->chars[si]; int c = char_->ucs; - if (extract_astring_cat_xmlc(alloc, content, c)) goto end; + if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end; } /* Remove any trailing '-' at end of line. 
*/ - if (astring_char_truncate_if(content, '-')) goto end; + if (extract_astring_char_truncate_if(content, '-')) goto end; } } - if (state->font_name) + if (content_state->font.name) { - if (extract_odt_run_finish(alloc, content)) goto end; - state->font_name = NULL; + if (s_odt_run_finish(alloc, content_state, content)) goto end; } - if (extract_odt_paragraph_finish(alloc, content)) goto end; + if (s_odt_paragraph_finish(alloc, content)) goto end; e = 0; @@ -337,7 +315,7 @@ font. */ return e; } -static int extract_document_append_image( +static int s_odt_append_image( extract_alloc_t* alloc, extract_astring_t* content, image_t* image @@ -362,7 +340,7 @@ static int extract_document_append_image( } -static int extract_document_output_rotated_paragraphs( +static int s_odt_output_rotated_paragraphs( extract_alloc_t* alloc, extract_page_t* page, int paragraph_begin, @@ -375,14 +353,14 @@ static int extract_document_output_rotated_paragraphs( int text_box_id, extract_astring_t* content, extract_odt_styles_t* styles, - content_state_t* state + content_state_t* content_state ) /* Writes paragraph to content inside rotated text box. */ { int e = 0; int p; double pt_to_inch = 1/72.0; - outf("rotated paragraphs: rotation_rad=%f (x y)=(%i %i) (w h)=(%i %i)", rotation_rad, x_pt, y_pt, w_pt, h_pt); + outf("rotated paragraphs: rotation_rad=%f (x y)=(%f %f) (w h)=(%f %f)", rotation_rad, x_pt, y_pt, w_pt, h_pt); // https://docs.oasis-open.org/office/OpenDocument/v1.3/cs02/part3-schema/OpenDocument-v1.3-cs02-part3-schema.html#attribute-draw_transform // says rotation is in degrees, but we seem to require -radians. 
@@ -414,7 +392,7 @@ static int extract_document_output_rotated_paragraphs( for (p=paragraph_begin; p<paragraph_end; ++p) { paragraph_t* paragraph = page->paragraphs[p]; - if (!e) e = extract_document_to_odt_content_paragraph(alloc, state, paragraph, content, styles); + if (!e) e = s_document_to_odt_content_paragraph(alloc, content_state, paragraph, content, styles); } if (!e) e = extract_astring_cat(alloc, content, "\n"); @@ -427,6 +405,219 @@ static int extract_document_output_rotated_paragraphs( } +static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content, extract_odt_styles_t* styles) +{ + int e = -1; + int y; + + { + int x; + static int table_number = 0; + table_number += 1; + if (extract_astring_catf(alloc, content, + "\n" + " <table:table text:style-name=\"extract.table\" table:name=\"extract.table.%i\">\n" + " <table:table-columns>\n" + , + table_number + )) goto end; + + for (x=0; x<table->cells_num_x; ++x) + { + if (extract_astring_cat(alloc, content, + " <table:table-column table:style-name=\"extract.table.column\"/>\n" + )) goto end; + } + if (extract_astring_cat(alloc, content, + " </table:table-columns>\n" + )) goto end; + } + for (y=0; y<table->cells_num_y; ++y) + { + int x; + if (extract_astring_cat(alloc, content, + " <table:table-row>\n" + )) goto end; + + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[y*table->cells_num_x + x]; + if (!cell->above || !cell->left) + { + if (extract_astring_cat(alloc, content, " <table:covered-table-cell/>\n")) goto end; + continue; + } + + if (extract_astring_cat(alloc, content, " <table:table-cell")) goto end; + if (cell->extend_right > 1) + { + if (extract_astring_catf(alloc, content, " table:number-columns-spanned=\"%i\"", cell->extend_right)) goto end; + } + if (cell->extend_down > 1) + { + if (extract_astring_catf(alloc, content, " table:number-rows-spanned=\"%i\"", cell->extend_down)) goto end; + } + if (extract_astring_catf(alloc, content, ">\n")) 
goto end; + + /* Write contents of this cell. */ + { + int p; + content_state_t content_state; + content_state.font.name = NULL; + content_state.ctm_prev = NULL; + for (p=0; p<cell->paragraphs_num; ++p) + { + paragraph_t* paragraph = cell->paragraphs[p]; + if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end; + } + if (content_state.font.name) + { + if (s_odt_run_finish(alloc, &content_state, content)) goto end; + } + if (extract_astring_cat(alloc, content, "\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </table:table-cell>\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </table:table-row>\n")) goto end; + } + if (extract_astring_cat(alloc, content, " </table:table>\n")) goto end; + e = 0; + + end: + return e; +} + + +static int s_odt_append_rotated_paragraphs( + extract_alloc_t* alloc, + extract_page_t* page, + content_state_t* content_state, + int* p, + int* text_box_id, + const matrix_t* ctm, + double rotate, + extract_astring_t* content, + extract_odt_styles_t* styles + ) +/* Appends paragraphs with same rotation, starting with page->paragraphs[*p] +and updates *p. */ +{ + /* Find extent of paragraphs with this same rotation. extent + will contain max width and max height of paragraphs, in units + before application of ctm, i.e. before rotation. */ + int e = -1; + point_t extent = {0, 0}; + int p0 = *p; + int p1; + paragraph_t* paragraph = page->paragraphs[*p]; + + outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", + rotate, rotate * 180 / pi, + ctm->e, + ctm->f, + ctm->a, + ctm->b, + ctm->c, + ctm->d + ); + + { + /* We assume that first span is at origin of text + block. This assumes left-to-right text. 
*/ + double rotate0 = rotate; + const matrix_t* ctm0 = ctm; + point_t origin = + { + paragraph->lines[0]->spans[0]->chars[0].x, + paragraph->lines[0]->spans[0]->chars[0].y + }; + matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; + double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; + if (ctm_det != 0) + { + ctm_inverse.a = +ctm->d / ctm_det; + ctm_inverse.b = -ctm->b / ctm_det; + ctm_inverse.c = -ctm->c / ctm_det; + ctm_inverse.d = +ctm->a / ctm_det; + } + else + { + outf("cannot invert ctm=(%f %f %f %f)", + ctm->a, ctm->b, ctm->c, ctm->d); + } + + for (*p=p0; *p<page->paragraphs_num; ++*p) + { + paragraph = page->paragraphs[*p]; + ctm = ¶graph->lines[0]->spans[0]->ctm; + rotate = atan2(ctm->b, ctm->a); + if (rotate != rotate0) + { + break; + } + + /* Update <extent>. */ + { + int l; + for (l=0; l<paragraph->lines_num; ++l) + { + line_t* line = paragraph->lines[l]; + span_t* span = extract_line_span_last(line); + char_t* char_ = extract_span_char_last(span); + double adv = char_->adv * extract_matrix_expansion(span->trm); + double x = char_->x + adv * cos(rotate); + double y = char_->y + adv * sin(rotate); + + double dx = x - origin.x; + double dy = y - origin.y; + + /* Position relative to origin and before box rotation. */ + double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; + double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; + yy = -yy; + if (xx > extent.x) extent.x = xx; + if (yy > extent.y) extent.y = yy; + if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", + rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span)); + } + } + } + p1 = *p; + rotate = rotate0; + ctm = ctm0; + outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", + rotate, p0, p1, extent.x, extent.y); + } + + /* Paragraphs p0..p1-1 have same rotation. We output them into + a single rotated text box. */ + + /* We need unique id for text box. 
*/ + *text_box_id += 1; + + if (s_odt_output_rotated_paragraphs( + alloc, + page, + p0, + p1, + rotate, + ctm->e, + ctm->f, + extent.x, + extent.y, + *text_box_id, + content, + styles, + content_state + )) goto end; + *p = p1 - 1; + e = 0; + + end: + return e; +} + + int extract_document_to_odt_content( extract_alloc_t* alloc, document_t* document, @@ -445,156 +636,66 @@ int extract_document_to_odt_content( for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - int p; - content_state_t state; - state.font_name = NULL; - state.font_size = 0; - state.font_bold = 0; - state.font_italic = 0; - state.ctm_prev = NULL; + int p = 0; + int t = 0; + content_state_t content_state; + content_state.font.name = NULL; + content_state.font.size = 0; + content_state.font.bold = 0; + content_state.font.italic = 0; + content_state.ctm_prev = NULL; - for (p=0; p<page->paragraphs_num; ++p) + for(;;) { - paragraph_t* paragraph = page->paragraphs[p]; - const matrix_t* ctm = ¶graph->lines[0]->spans[0]->ctm; - double rotate = atan2(ctm->b, ctm->a); + paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p]; + table_t* table = (t == page->tables_num) ? NULL : page->tables[t]; + double y_paragraph; + double y_table; + if (!paragraph && !table) break; + y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX; + y_table = (table) ? table->pos.y : DBL_MAX; - if (spacing - && state.ctm_prev - && paragraph->lines_num - && paragraph->lines[0]->spans_num - && matrix_cmp4( - state.ctm_prev, - ¶graph->lines[0]->spans[0]->ctm - ) - ) + if (paragraph && y_paragraph < y_table) { - /* Extra vertical space between paragraphs that were at - different angles in the original document. 
*/ - if (extract_odt_paragraph_empty(alloc, content, styles)) goto end; - } + const matrix_t* ctm = ¶graph->lines[0]->spans[0]->ctm; + double rotate = atan2(ctm->b, ctm->a); + + if (spacing + && content_state.ctm_prev + && paragraph->lines_num + && paragraph->lines[0]->spans_num + && extract_matrix_cmp4( + content_state.ctm_prev, + ¶graph->lines[0]->spans[0]->ctm + ) + ) + { + /* Extra vertical space between paragraphs that were at + different angles in the original document. */ + if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end; + } - if (spacing) - { - /* Extra vertical space between paragraphs. */ - if (extract_odt_paragraph_empty(alloc, content, styles)) goto end; - } - - if (rotation && rotate != 0) - { - /* Find extent of paragraphs with this same rotation. extent - will contain max width and max height of paragraphs, in units - before application of ctm, i.e. before rotation. */ - point_t extent = {0, 0}; - int p0 = p; - int p1; - - outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)", - rotate, rotate * 180 / pi, - ctm->e, - ctm->f, - ctm->a, - ctm->b, - ctm->c, - ctm->d - ); - + if (spacing) { - /* We assume that first span is at origin of text - block. This assumes left-to-right text. 
*/ - double rotate0 = rotate; - const matrix_t* ctm0 = ctm; - point_t origin = - { - paragraph->lines[0]->spans[0]->chars[0].x, - paragraph->lines[0]->spans[0]->chars[0].y - }; - matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; - double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; - if (ctm_det != 0) - { - ctm_inverse.a = +ctm->d / ctm_det; - ctm_inverse.b = -ctm->b / ctm_det; - ctm_inverse.c = -ctm->c / ctm_det; - ctm_inverse.d = +ctm->a / ctm_det; - } - else - { - outf("cannot invert ctm=(%f %f %f %f)", - ctm->a, ctm->b, ctm->c, ctm->d); - } - - for (p=p0; p<page->paragraphs_num; ++p) - { - paragraph = page->paragraphs[p]; - ctm = ¶graph->lines[0]->spans[0]->ctm; - rotate = atan2(ctm->b, ctm->a); - if (rotate != rotate0) - { - break; - } - - /* Update <extent>. */ - { - int l; - for (l=0; l<paragraph->lines_num; ++l) - { - line_t* line = paragraph->lines[l]; - span_t* span = line_span_last(line); - char_t* char_ = span_char_last(span); - double adv = char_->adv * matrix_expansion(span->trm); - double x = char_->x + adv * cos(rotate); - double y = char_->y + adv * sin(rotate); - - double dx = x - origin.x; - double dy = y - origin.y; - - /* Position relative to origin and before box rotation. */ - double xx = ctm_inverse.a * dx + ctm_inverse.b * dy; - double yy = ctm_inverse.c * dx + ctm_inverse.d * dy; - yy = -yy; - if (xx > extent.x) extent.x = xx; - if (yy > extent.y) extent.y = yy; - if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s", - rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span)); - } - } - } - p1 = p; - rotate = rotate0; - ctm = ctm0; - outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)", - rotate, p0, p1, extent.x, extent.y); + /* Extra vertical space between paragraphs. */ + if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end; } - - /* Paragraphs p0..p1-1 have same rotation. We output them into - a single rotated text box. */ - - /* We need unique id for text box. 
*/ - text_box_id += 1; - - if (extract_document_output_rotated_paragraphs( - alloc, - page, - p0, - p1, - rotate, - ctm->e, - ctm->f, - extent.x, - extent.y, - text_box_id, - content, - styles, - &state - )) goto end; - p = p1 - 1; + + if (rotation && rotate != 0) + { + if (s_odt_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content, styles)) goto end; + } + else + { + if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end; + } + p += 1; } - else + else if (table) { - if (extract_document_to_odt_content_paragraph(alloc, &state, paragraph, content, styles)) goto end; + if (s_odt_append_table(alloc, table, content, styles)) goto end; + t += 1; } - } outf("images=%i", images); @@ -604,7 +705,7 @@ int extract_document_to_odt_content( outf("page->images_num=%i", page->images_num); for (i=0; i<page->images_num; ++i) { - extract_document_append_image(alloc, content, &page->images[i]); + s_odt_append_image(alloc, content, &page->images[i]); } } } @@ -658,26 +759,39 @@ int extract_odt_content_item( char* text_intermediate = NULL; extract_astring_t styles_definitions = {0}; + /* Insert content before '</office:text>'. */ if (extract_content_insert( alloc, text, NULL /*single*/, - NULL, - "</office:text>", + NULL /*mid_begin_name*/, + "</office:text>" /*mid_end_name*/, contentss, contentss_num, &text_intermediate )) goto end; outf("text_intermediate: %s", text_intermediate); - if (extract_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end; + /* Convert <styles> to text. */ + if (s_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end; + /* To make tables work, we seem to need to specify table and column + styles, and these can be empty. todo: maybe specify exact sizes based + on the pdf table and cell dimensions. 
*/ + if (extract_astring_cat(alloc, &styles_definitions, + "\n" + "<style:style style:name=\"extract.table\" style:family=\"table\"/>\n" + "<style:style style:name=\"extract.table.column\" style:family=\"table-column\"/>\n" + )) goto end; + + /* Replace '<office:automatic-styles/>' with text from + <styles_definitions>. */ e = extract_content_insert( alloc, text_intermediate, "<office:automatic-styles/>" /*single*/, - NULL, - NULL, //"</office:automatic-styles>", + NULL /*mid_begin_name*/, + NULL /*mid_end_name*/, &styles_definitions, 1, text2 @@ -719,14 +833,14 @@ int extract_odt_content_item( } e = 0; end: - outf("e=%i errno=%i text2=%s", e, errno, text2); + outf("e=%i errno=%i text2=%s", e, errno, text2 ? *text2 : ""); if (e) { /* We might have set <text2> to new content. */ extract_free(alloc, text2); /* We might have used <temp> as a temporary buffer. */ - extract_astring_free(alloc, &temp); } + extract_astring_free(alloc, &temp); extract_astring_init(&temp); return e; } @@ -747,7 +861,6 @@ int extract_odt_write_template( int e = -1; int i; char* path_tempdir = NULL; - FILE* f = NULL; char* path = NULL; char* text = NULL; char* text2 = NULL; @@ -827,7 +940,6 @@ int extract_odt_write_template( } /* Copy images into <path_tempdir>/Pictures/. 
*/ - outf(""); extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/Pictures", path_tempdir) < 0) goto end; if (extract_mkdir(path, 0777)) @@ -835,7 +947,6 @@ int extract_odt_write_template( outf("Failed to mkdir %s", path); goto end; } - outf(""); for (i=0; i<images->images_num; ++i) { image_t* image = &images->images[i]; @@ -869,8 +980,6 @@ int extract_odt_write_template( extract_free(alloc, &path); extract_free(alloc, &text); extract_free(alloc, &text2); - //extract_odt_styles_free(alloc, &styles); - if (f) fclose(f); if (e) { diff --git a/extract/src/outf.c b/extract/src/outf.c index 95575c16..de7662f6 100644 --- a/extract/src/outf.c +++ b/extract/src/outf.c @@ -5,14 +5,14 @@ #include <stdio.h> #include <string.h> -static int s_verbose = 0; +int extract_outf_verbose = 0; -void outf_verbose_set(int verbose) +void extract_outf_verbose_set(int verbose) { - s_verbose = verbose; + extract_outf_verbose = verbose; } -void (outf)( +void (extract_outf)( int level, const char* file, int line, @@ -23,7 +23,7 @@ void (outf)( ) { va_list va; - if (level > s_verbose) { + if (level > extract_outf_verbose) { return; } diff --git a/extract/src/outf.h b/extract/src/outf.h index a2b6c078..f9b97a93 100644 --- a/extract/src/outf.h +++ b/extract/src/outf.h @@ -1,32 +1,42 @@ #ifndef ARTIFEX_EXTRACT_OUTF_H #define ARTIFEX_EXTRACT_OUTF_H +/* Simple printf-style debug output. */ + +#if defined(__GNUC__) || defined(__clang__) || defined(_WIN32) + #define extract_FUNCTION __FUNCTION__ +#else + #define extract_FUNCTION "" +#endif + +#define outf(format, ...) \ + (1 > extract_outf_verbose) ? (void) 0 : (extract_outf)(1, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__) + +#define outf0(format, ...) \ + (0 > extract_outf_verbose) ? (void) 0 : (extract_outf)(0, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__) + +#define outfx(format, ...) + /* Only for internal use by extract code. 
*/ -void (outf)( +extern int extract_outf_verbose; + +void (extract_outf)( int level, const char* file, int line, const char* fn, int ln, const char* format, ... - ); + ) + #ifdef __GNUC__ + __attribute__ ((format (printf, 6, 7))) + #endif + ; /* Outputs text if <level> is less than or equal to verbose value set by outf_level_set(). */ -#define outf(format, ...) \ - (outf)(1, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__) - -#define outf0(format, ...) \ - (outf)(0, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__) - -#define outfx(format, ...) - -/* Simple printf-style debug output. */ - -#define outfx(format, ...) - -void outf_verbose_set(int verbose); +void extract_outf_verbose_set(int verbose); /* Set verbose value. Higher values are more verbose. Initial value is 0. */ #endif diff --git a/extract/src/sys.c b/extract/src/sys.c index 131f6312..2359acab 100644 --- a/extract/src/sys.c +++ b/extract/src/sys.c @@ -82,7 +82,7 @@ int extract_read_all_path(extract_alloc_t* alloc, const char* path, char** o_te e = 0; end: if (f) fclose(f); - if (e) extract_free(alloc, &o_text); + if (e) extract_free(alloc, o_text); return e; } diff --git a/extract/src/text.c b/extract/src/text.c index f832baa2..e75e3e69 100644 --- a/extract/src/text.c +++ b/extract/src/text.c @@ -18,23 +18,6 @@ int extract_content_insert( int contentss_num, char** o_out ) -/* Creates a new string by inserting sequence of strings into a template -string. - -If <single_name> is in <original>, it is replaced by <contentss>. - -Otherwise the text between the end of <mid_begin_name> and beginning of -<mid_end_name> is replaced by <contentss>. - -If <mid_begin_name> is NULL, we insert into the zero-length region before -<mid_end_name>. - -If <mid_end_name> is NULL, we insert into the zero-length region after -<mid_begin_name>. - -At least one of <single_name>, <mid_begin_name> and <mid_end_name> must be -non-NULL. 
-*/ { int e = -1; const char* mid_begin = NULL; @@ -92,6 +75,11 @@ non-NULL. if (extract_astring_catl(alloc, &out, contentss[i].chars, contentss[i].chars_num)) goto end; } } + assert( mid_end); + /* As per docs, at least one of <single_name>, <mid_begin_name> and + <mid_end_name> is non-null, and this ensures that mid_end must not be null. + */ + /* coverity[var_deref_model] */ if (extract_astring_cat(alloc, &out, mid_end)) goto end; *o_out = out.chars; diff --git a/extract/src/xml.c b/extract/src/xml.c index 8dab511b..24116f6d 100644 --- a/extract/src/xml.c +++ b/extract/src/xml.c @@ -349,7 +349,7 @@ int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, co } first_line_buffer[actual] = 0; if (strcmp(first_line, first_line_buffer)) { - outf("Unrecognised prefix: ", first_line_buffer); + outf("Unrecognised prefix: %s", first_line_buffer); errno = ESRCH; goto end; } @@ -393,7 +393,10 @@ static const char* extract_xml_tag_string(extract_alloc_t* alloc, extract_xml_ta { static char* buffer = NULL; extract_free(alloc, &buffer); - extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : ""); + if (extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : "")) + { + return ""; + } return buffer; } @@ -410,7 +413,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out) assert(buffer); extract_xml_tag_free(alloc, out); - /* Read tag name. */ + /* Read tag name. Initialise it to empty string so we never return + out->name==null on success. 
*/ + if (str_catl( alloc, &out->name, NULL, 0)) goto end; for( i=0;; ++i) { int e = extract_buffer_read(buffer, &c, 1, NULL); if (e) { @@ -438,6 +443,7 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out) int quote_single = 0; int quote_double = 0; size_t l; + if (str_catl( alloc, &attribute_value, NULL, 0)) goto end; for(;;) { if (s_next(buffer, &ret, &c)) goto end; if (c == '\'') quote_single = !quote_single; @@ -469,6 +475,10 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out) } } + /* Ensure name and value are not NULL. */ + if (str_catl( alloc, &attribute_name, NULL, 0)) goto end; + if (str_catl( alloc, &attribute_value, NULL, 0)) goto end; + if (extract_xml_tag_attributes_append(alloc, out, attribute_name, attribute_value)) goto end; attribute_name = NULL; attribute_value = NULL; diff --git a/extract/src/xml.h b/extract/src/xml.h index d11fd886..8bc4dae2 100644 --- a/extract/src/xml.h +++ b/extract/src/xml.h @@ -35,6 +35,9 @@ void extract_xml_tag_free(extract_alloc_t* alloc, extract_xml_tag_t* tag); int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, const char* first_line); /* extract_xml_pparse_*(): simple XML 'pull' parser. +If <first_line> is not NULL, we require that <buffer> starts with the specified +text. Usually one would include a final newline in <first_line>. + extract_xml_pparse_init() merely consumes the initial '<'. Thereafter extract_xml_pparse_next() consumes the next '<' before returning the previous tag. */ @@ -53,6 +56,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out); Returns 0 with *out containing next tag; or -1 with errno set if error; or +1 with errno=ESRCH if EOF. +If we return 0, we guarantee that out->name points to valid string and that +each item in out->attributes has similarly valid name and value members. + *out is initially passed to extract_xml_tag_free(), so *out must have been initialised, e.g. 
by by extract_xml_tag_init(). */ diff --git a/extract/src/zip.c b/extract/src/zip.c index 03bfd024..691b743b 100644 --- a/extract/src/zip.c +++ b/extract/src/zip.c @@ -10,6 +10,7 @@ #include <assert.h> #include <errno.h> #include <limits.h> +#include <time.h> #ifdef _MSC_VER #include "compat_stdint.h" @@ -74,8 +75,38 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip) /* We could maybe convert current date/time to the ms-dos format required here, but using zeros doesn't seem to make a difference to Word etc. */ - zip->mtime = 0; - zip->mdate = 0; + + { + time_t t = time(NULL); + struct tm* tm; + #ifdef _POSIX_SOURCE + struct tm tm_local; + tm = gmtime_r(&t, &tm_local); + #else + tm = gmtime(&t); + #endif + if (tm) + { + /* mdate and mtime are in MS DOS format: + mtime: + bits 0-4: seconds / 2. + bits 5-10: minute (0-59). + bits 11-15: hour (0-23). + mdate: + bits 0-4: day of month (1-31). + bits 5-8: month (1=jan, 2=feb, etc). + bits 9-15: year - 1980. + */ + zip->mtime = (uint16_t) ((tm->tm_hour << 11) | (tm->tm_min << 5) | (tm->tm_sec / 2)); + zip->mdate = (uint16_t) (((1900 + tm->tm_year - 1980) << 9) | ((tm->tm_mon + 1) << 5) | tm->tm_mday); + } + else + { + outf0("*** gmtime_r() failed"); + zip->mtime = 0; + zip->mdate = 0; + } + } /* These are all copied from command-line zip on unix. */ zip->version_creator = (0x3 << 8) + 30; /* 0x3 is unix, 30 means 3.0. */ @@ -115,7 +146,9 @@ static int s_native_little_endinesss(void) /* Native big-endiness. */ return 0; } - abort(); + /* Would like to call abort() here, but that breaks on AIX/gcc. */ + assert(0); + return 0; } @@ -148,7 +181,7 @@ static int s_write_compressed( /* Uses zlib to write raw deflate compressed data to zip->buffer. */ { int ze; - z_stream zstream; + z_stream zstream = {0}; /* Initialise to keep Coverity quiet. 
*/ if (zip->errno_) return -1; if (zip->eof) return +1; @@ -313,7 +346,7 @@ int extract_zip_write_file( cd_file->name = NULL; cd_file->mtime = zip->mtime; - cd_file->mdate = zip->mtime; + cd_file->mdate = zip->mdate; cd_file->crc_sum = (int32_t) crc32(crc32(0, NULL, 0), data, (int) data_length); cd_file->size_uncompressed = (int) data_length; if (zip->compression_method == 0) |