summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'extract/src')
-rw-r--r--extract/src/astring.c127
-rw-r--r--extract/src/astring.h28
-rw-r--r--extract/src/buffer-test.c2
-rw-r--r--extract/src/buffer.c2
-rw-r--r--extract/src/document.c88
-rw-r--r--extract/src/document.h137
-rw-r--r--extract/src/docx.c598
-rw-r--r--extract/src/docx.h4
-rwxr-xr-xextract/src/docx_template_build.py30
-rw-r--r--extract/src/extract-exe.c3
-rw-r--r--extract/src/extract.c891
-rw-r--r--extract/src/html.c314
-rw-r--r--extract/src/html.h23
-rw-r--r--extract/src/join.c1241
-rw-r--r--extract/src/mem.c22
-rw-r--r--extract/src/mem.h13
-rwxr-xr-xextract/src/memento.py39
-rw-r--r--extract/src/misc-test.c58
-rw-r--r--extract/src/odt.c627
-rw-r--r--extract/src/outf.c10
-rw-r--r--extract/src/outf.h40
-rw-r--r--extract/src/sys.c2
-rw-r--r--extract/src/text.c22
-rw-r--r--extract/src/xml.c16
-rw-r--r--extract/src/xml.h6
-rw-r--r--extract/src/zip.c43
26 files changed, 3533 insertions, 853 deletions
diff --git a/extract/src/astring.c b/extract/src/astring.c
index fd09d639..e5d40217 100644
--- a/extract/src/astring.c
+++ b/extract/src/astring.c
@@ -27,6 +27,9 @@ void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string)
int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len)
{
if (extract_realloc2(alloc, &string->chars, string->chars_num+1, string->chars_num + s_len + 1)) return -1;
+ /* Coverity doesn't seem to realise that extract_realloc2() modifies
+ string->chars. */
+ /* coverity[deref_parm_field_in_call] */
memcpy(string->chars + string->chars_num, s, s_len);
string->chars[string->chars_num + s_len] = 0;
string->chars_num += s_len;
@@ -65,7 +68,7 @@ int extract_astring_truncate(extract_astring_t* content, int len)
return 0;
}
-int astring_char_truncate_if(extract_astring_t* content, char c)
+int extract_astring_char_truncate_if(extract_astring_t* content, char c)
{
if (content->chars_num && content->chars[content->chars_num-1] == c) {
extract_astring_truncate(content, 1);
@@ -73,40 +76,58 @@ int astring_char_truncate_if(extract_astring_t* content, char c)
return 0;
}
-int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c)
+int extract_astring_catc_unicode(
+ extract_alloc_t* alloc,
+ extract_astring_t* string,
+ int c,
+ int xml,
+ int ascii_ligatures,
+ int ascii_dash,
+ int ascii_apostrophe
+ )
{
int ret = -1;
if (0) {}
/* Escape XML special characters. */
- else if (c == '<') extract_astring_cat(alloc, string, "&lt;");
- else if (c == '>') extract_astring_cat(alloc, string, "&gt;");
- else if (c == '&') extract_astring_cat(alloc, string, "&amp;");
- else if (c == '"') extract_astring_cat(alloc, string, "&quot;");
- else if (c == '\'') extract_astring_cat(alloc, string, "&apos;");
+ else if (xml && c == '<') extract_astring_cat(alloc, string, "&lt;");
+ else if (xml && c == '>') extract_astring_cat(alloc, string, "&gt;");
+ else if (xml && c == '&') extract_astring_cat(alloc, string, "&amp;");
+ else if (xml && c == '"') extract_astring_cat(alloc, string, "&quot;");
+ else if (xml && c == '\'') extract_astring_cat(alloc, string, "&apos;");
/* Expand ligatures. */
- else if (c == 0xFB00)
+ else if (ascii_ligatures && c == 0xFB00)
{
if (extract_astring_cat(alloc, string, "ff")) goto end;
}
- else if (c == 0xFB01)
+ else if (ascii_ligatures && c == 0xFB01)
{
if (extract_astring_cat(alloc, string, "fi")) goto end;
}
- else if (c == 0xFB02)
+ else if (ascii_ligatures && c == 0xFB02)
{
if (extract_astring_cat(alloc, string, "fl")) goto end;
}
- else if (c == 0xFB03)
+ else if (ascii_ligatures && c == 0xFB03)
{
if (extract_astring_cat(alloc, string, "ffi")) goto end;
}
- else if (c == 0xFB04)
+ else if (ascii_ligatures && c == 0xFB04)
{
if (extract_astring_cat(alloc, string, "ffl")) goto end;
}
+
+ /* Convert some special characters to ascii. */
+ else if (ascii_dash && c == 0x2212)
+ {
+ if (extract_astring_catc(alloc, string, '-')) goto end;
+ }
+ else if (ascii_apostrophe && c == 0x2019)
+ {
+ if (extract_astring_catc(alloc, string, '\'')) goto end;
+ }
/* Output ASCII verbatim. */
else if (c >= 32 && c <= 127)
@@ -117,18 +138,65 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string,
/* Escape all other characters. */
else
{
- char buffer[32];
- if (c < 32
- && (c != 0x9 && c != 0xa && c != 0xd)
- )
+ if (xml)
{
- /* Illegal xml character; see
- https://www.w3.org/TR/xml/#charsets. We replace with
- 0xfffd, the unicode replacement character. */
- c = 0xfffd;
+ char buffer[32];
+ if (c < 32
+ && (c != 0x9 && c != 0xa && c != 0xd)
+ )
+ {
+ /* Illegal xml character; see
+ https://www.w3.org/TR/xml/#charsets. We replace with
+ 0xfffd, the unicode replacement character. */
+ c = 0xfffd;
+ }
+ snprintf(buffer, sizeof(buffer), "&#x%x;", c);
+ if (extract_astring_cat(alloc, string, buffer)) goto end;
+ }
+ else
+ {
+ /* Use utf8. */
+ if (c < 0x80)
+ {
+ if (extract_astring_catc(alloc, string, (char) c)) return -1;
+ }
+ else if (c < 0x0800)
+ {
+ char cc[2] =
+ {
+ (char) (((c >> 6) & 0x1f) | 0xc0),
+ (char) (((c >> 0) & 0x3f) | 0x80)
+ };
+ if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+ }
+ else if (c < 0x10000)
+ {
+ char cc[3] =
+ {
+ (char) (((c >> 12) & 0x0f) | 0xe0),
+ (char) (((c >> 6) & 0x3f) | 0x80),
+ (char) (((c >> 0) & 0x3f) | 0x80)
+ };
+ if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+ }
+ else if (c < 0x110000)
+ {
+ char cc[4] =
+ {
+ (char) (((c >> 18) & 0x07) | 0xf0),
+ (char) (((c >> 12) & 0x3f) | 0x80),
+ (char) (((c >> 6) & 0x3f) | 0x80),
+ (char) (((c >> 0) & 0x3f) | 0x80)
+ };
+ if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+ }
+ else
+ {
+ /* Use replacement character. */
+ char cc[4] = { (char) 0xef, (char) 0xbf, (char) 0xbd, 0};
+ if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+ }
}
- snprintf(buffer, sizeof(buffer), "&#x%x;", c);
- if (extract_astring_cat(alloc, string, buffer)) goto end;
}
ret = 0;
@@ -136,3 +204,18 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string,
end:
return ret;
}
+
+int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c)
+/* Convenience wrapper round extract_astring_catc_unicode(): appends <c> with
+XML escaping and ligature expansion enabled, and with dash/apostrophe
+conversion disabled. Returns whatever extract_astring_catc_unicode() returns. */
+{
+    /* Fixme, better to use ascii_ligatures=0, but that requires updates to
+    expected output files. */
+    const int xml = 1;
+    const int ascii_ligatures = 1;
+    const int ascii_dash = 0;
+    const int ascii_apostrophe = 0;
+
+    return extract_astring_catc_unicode(alloc, string, c, xml, ascii_ligatures, ascii_dash, ascii_apostrophe);
+}
diff --git a/extract/src/astring.h b/extract/src/astring.h
index c2b60d25..aef4d87f 100644
--- a/extract/src/astring.h
+++ b/extract/src/astring.h
@@ -11,8 +11,11 @@ typedef struct
} extract_astring_t;
void extract_astring_init(extract_astring_t* string);
+/* Initialises <string> so it is ready for use. */
void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string);
+/* Frees any existing data and returns with <string> ready for use as if by
+extract_astring_init(). */
int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len);
@@ -24,10 +27,33 @@ int extract_astring_catf(extract_alloc_t* alloc, extract_astring_t* string, cons
int extract_astring_truncate(extract_astring_t* content, int len);
/* Removes last <len> chars. */
-int astring_char_truncate_if(extract_astring_t* content, char c);
+int extract_astring_char_truncate_if(extract_astring_t* content, char c);
/* Removes last char if it is <c>. */
int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c);
/* Appends specified character using XML escapes as necessary. */
+int extract_astring_catc_unicode(
+ extract_alloc_t* alloc,
+ extract_astring_t* string,
+ int c,
+ int xml,
+ int ascii_ligatures,
+ int ascii_dash,
+ int ascii_apostrophe
+ );
+/* Appends unicode character <c> to <string>.
+ xml:
+ If true, we use XML escape sequences for special characters such as '<'
+ and unicode values above 127. Otherwise we encode as utf8.
+ ascii_ligatures: if true we expand ligatures to "fl", "fi" etc.
+ ascii_dash:
+ If true we replace the unicode minus sign (U+2212) with ascii '-'.
+ ascii_apostrophe:
+ If true we replace the unicode right single quotation mark (U+2019) with ascii single-quote "'".
+*/
+
+int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c);
+/* Appends specific unicode character, using XML escape sequences as required. */
+
#endif
diff --git a/extract/src/buffer-test.c b/extract/src/buffer-test.c
index 6701fbab..a8464c2a 100644
--- a/extract/src/buffer-test.c
+++ b/extract/src/buffer-test.c
@@ -298,7 +298,7 @@ static void test_file(void)
int main(void)
{
- outf_verbose_set(1);
+ extract_outf_verbose_set(1);
test_read();
test_write();
test_file();
diff --git a/extract/src/buffer.c b/extract/src/buffer.c
index 3fd35bfd..b25dee73 100644
--- a/extract/src/buffer.c
+++ b/extract/src/buffer.c
@@ -375,7 +375,7 @@ int extract_buffer_write_internal(
not recoverable. <pos> will be the number of bytes in
source..+numbytes that have been successfully flushed, and
could be negative if we failed to flush earlier data. */
- outf("failed to flush. actual=%i delta=%i\n", actual, delta);
+ outf("failed to flush. actual=%li delta=%li\n", (long) actual, (long) delta);
e = 0;
goto end;
}
diff --git a/extract/src/document.c b/extract/src/document.c
new file mode 100644
index 00000000..d501f259
--- /dev/null
+++ b/extract/src/document.c
@@ -0,0 +1,88 @@
+#include "document.h"
+#include "outf.h"
+
+
+void extract_span_init(span_t* span)
+/* Puts <span> into an empty state: no font name and no characters. Only these
+three members are written; other members of *span are left as they are. */
+{
+    span->chars_num = 0;
+    span->chars = NULL;
+    span->font_name = NULL;
+}
+
+void extract_span_free(extract_alloc_t* alloc, span_t** pspan)
+/* Releases the span's font name and character array and then the span itself,
+all via extract_free(). Does nothing if *pspan is NULL. */
+{
+    span_t* span = *pspan;
+    if (span)
+    {
+        extract_free(alloc, &span->font_name);
+        extract_free(alloc, &span->chars);
+        extract_free(alloc, pspan);
+    }
+}
+
+void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num)
+/* Frees the first <spans_num> spans in the array *pspans with
+extract_span_free(), then frees the array itself with extract_free(). */
+{
+    int i = 0;
+    while (i < spans_num)
+    {
+        extract_span_free(alloc, &(*pspans)[i]);
+        i += 1;
+    }
+    extract_free(alloc, pspans);
+}
+
+void extract_line_free(extract_alloc_t* alloc, line_t** pline)
+/* Frees a line_t together with all of its spans. Does nothing if *pline is
+NULL, in the same way as extract_span_free() and extract_cell_free(). */
+{
+    line_t* line = *pline;
+    if (!line) return;
+    /* Frees each span and then the spans array itself. */
+    extract_spans_free(alloc, &line->spans, line->spans_num);
+    extract_free(alloc, pline);
+}
+
+void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num)
+/* Frees the first <lines_num> lines in the array *plines with
+extract_line_free(), then frees the array itself with extract_free(). */
+{
+    int i = 0;
+    while (i < lines_num)
+    {
+        extract_line_free(alloc, &(*plines)[i]);
+        i += 1;
+    }
+    extract_free(alloc, plines);
+}
+
+void extract_image_clear(extract_alloc_t* alloc, image_t* image)
+/* Releases what <image> holds: the type, name and id members via
+extract_free(), and the image data via the image's own data_free callback if
+one is set. *image itself is not freed.
+NOTE(review): image->data and image->data_free are left unchanged after the
+callback has run - confirm that no caller clears the same image twice. */
+{
+    extract_free(alloc, &image->type);
+    extract_free(alloc, &image->name);
+    extract_free(alloc, &image->id);
+    if (!image->data_free) return;
+    image->data_free(image->data_free_handle, image->data);
+}
+
+void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell)
+/* Frees a cell_t together with its lines and paragraphs. Does nothing if
+*pcell is NULL. */
+{
+    int p;
+    cell_t* cell = *pcell;
+    if (!cell) return;
+
+    outf("cell->lines_num=%i", cell->lines_num);
+    outf("cell->paragraphs_num=%i", cell->paragraphs_num);
+    extract_lines_free(alloc, &cell->lines, cell->lines_num);
+
+    /* %p requires a void* argument, so cast explicitly. */
+    outf("cell=%p cell->paragraphs_num=%i", (void*) cell, cell->paragraphs_num);
+    for (p=0; p<cell->paragraphs_num; ++p)
+    {
+        paragraph_t* paragraph = cell->paragraphs[p];
+        outf("paragraph->lines_num=%i", paragraph->lines_num);
+        /* We don't attempt to free paragraph->lines[] because they point into
+        cell->lines which are already freed. */
+        extract_free(alloc, &paragraph->lines);
+        extract_free(alloc, &cell->paragraphs[p]);
+    }
+    extract_free(alloc, &cell->paragraphs);
+    extract_free(alloc, pcell);
+}
+
+
diff --git a/extract/src/document.h b/extract/src/document.h
index c59348f4..2dc4f1ee 100644
--- a/extract/src/document.h
+++ b/extract/src/document.h
@@ -1,6 +1,15 @@
#ifndef ARTIFEX_EXTRACT_DOCUMENT_H
#define ARTIFEX_EXTRACT_DOCUMENT_H
+#include "../include/extract.h"
+
+#ifdef _MSC_VER
+ #include "compat_stdint.h"
+#else
+ #include <stdint.h>
+#endif
+
+
static const double pi = 3.141592653589793;
typedef struct
@@ -9,6 +18,16 @@ typedef struct
double y;
} point_t;
+const char* extract_point_string(const point_t* point);
+
+typedef struct
+{
+ point_t min;
+ point_t max;
+} rect_t; /* Two points; presumably the min and max corners of an axis-aligned rectangle - TODO confirm. */
+
+const char* extract_rect_string(const rect_t* rect);
+
typedef struct
{
double a;
@@ -19,9 +38,15 @@ typedef struct
double f;
} matrix_t;
-double matrix_expansion(matrix_t m);
+const char* extract_matrix_string(const matrix_t* matrix);
-int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
+double extract_matrix_expansion(matrix_t m);
+/* Returns a*d - b*c. */
+
+point_t extract_multiply_matrix_point(matrix_t m, point_t p);
+matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2);
+
+int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
;
/* Returns zero if first four members of *lhs and *rhs are equal, otherwise
+/-1. */
@@ -48,7 +73,7 @@ typedef struct
matrix_t trm;
char* font_name;
- /* font size is matrix_expansion(trm). */
+ /* font size is extract_matrix_expansion(trm). */
struct {
unsigned font_bold : 1;
@@ -61,14 +86,21 @@ typedef struct
} span_t;
/* List of chars that have same font and are usually adjacent. */
-char_t* span_char_last(span_t* span);
+void extract_span_init(span_t* span);
+
+void extract_span_free(extract_alloc_t* alloc, span_t** pspan);
+/* Frees a span_t, returning with *pspan set to NULL. */
+
+void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num);
+
+char_t* extract_span_char_last(span_t* span);
/* Returns last character in span. */
-int span_append_c(extract_alloc_t* alloc, span_t* span, int c);
+int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c);
/* Appends new char_t to an span_t with .ucs=c and all other
fields zeroed. */
-const char* span_string(extract_alloc_t* alloc, span_t* span);
+const char* extract_span_string(extract_alloc_t* alloc, span_t* span);
/* Returns static string containing info about span_t. */
typedef struct
@@ -78,10 +110,13 @@ typedef struct
} line_t;
/* List of spans that are aligned on same line. */
-span_t* line_span_first(line_t* line);
+void extract_line_free(extract_alloc_t* alloc, line_t** pline);
+void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num);
+
+span_t* extract_line_span_first(line_t* line);
/* Returns first span in a line. */
-span_t* line_span_last(line_t* line);
+span_t* extract_line_span_last(line_t* line);
/* Returns last span in a line. */
typedef struct
@@ -112,6 +147,61 @@ typedef struct
<name> and <id> are created to be unique identifiers for use in generated docx
file. */
+void extract_image_clear(extract_alloc_t* alloc, image_t* image);
+
+typedef struct
+{
+ float color;
+ rect_t rect;
+} tableline_t;
+/* A line that is part of a table. <rect> is the area covered by the line.
+NOTE(review): the scale/meaning of <color> is not visible in this header -
+confirm against the code that fills it in. */
+
+typedef struct
+{
+ tableline_t* tablelines;
+ int tablelines_num;
+} tablelines_t; /* Array of tableline_t; tablelines_num is presumably the number of items in <tablelines>. */
+
+
+typedef struct
+{
+ rect_t rect;
+
+ /* If left/above is true, this cell is not obscured by cell to its
+ left/above. */
+ uint8_t left;
+ uint8_t above;
+
+ /* extend_right and extend_down are 1 for normal cells, 2 for cells which
+ extend right/down to cover an additional column/row, 3 to cover two
+ additional columns/rows etc. */
+ int extend_right;
+ int extend_down;
+
+ /* Contents of this cell. extract_cell_free() frees both arrays; each
+ paragraph's lines[] point into <lines> rather than owning separate
+ line_t's. */
+ line_t** lines;
+ int lines_num;
+ paragraph_t** paragraphs;
+ int paragraphs_num;
+} cell_t;
+/* A cell within a table. Free with extract_cell_free(). */
+
+void extract_cell_init(cell_t* cell);
+void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell);
+
+typedef struct
+{
+ point_t pos; /* top-left. */
+
+ /* Array of cells_num_x*cells_num_y cells; cell (x, y) is:
+ cells_num_x * y + x.
+ I.e. cells are stored row by row, with cells_num_x cells in each row.
+ */
+ cell_t** cells;
+ int cells_num_x;
+ int cells_num_y;
+} table_t; /* A table: a grid of cell_t pointers positioned at <pos>. */
+
+
typedef struct
{
span_t** spans;
@@ -129,10 +219,17 @@ typedef struct
int paragraphs_num;
/* These refer to items in .lines. Initially empty, then set
by extract_join(). */
+
+ tablelines_t tablelines_horizontal;
+ tablelines_t tablelines_vertical;
+
+ table_t** tables;
+ int tables_num;
} extract_page_t;
/* A page. Contains different representations of the list of spans. NB not
-called page_t because this clashes with a system type on hpux. */
+called page_t because this clashes with a system type on hpux. */
+
typedef struct
{
@@ -150,9 +247,31 @@ typedef struct
int imagetypes_num;
} images_t;
+
int extract_document_join(extract_alloc_t* alloc, document_t* document);
+/* This does all the work of finding paragraphs and tables. */
double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm);
+/* Things below here are used when generating output. */
+
+typedef struct
+{
+ char* name;
+ double size;
+ int bold;
+ int italic;
+} font_t;
+/* Basic information about current font. In docx.c <name> is set by copying a
+span's font_name pointer (not the string), and NULL is used to mean that no
+run is currently open. */
+
+typedef struct
+{
+ font_t font;
+ matrix_t* ctm_prev;
+} content_state_t;
+/* Used to keep track of font information when writing paragraphs of docx or
+odt content, e.g. so we know whether a font has changed so need to start a new
+run/span. <ctm_prev> points to the ctm of the most recently written span. */
+
#endif
diff --git a/extract/src/docx.c b/extract/src/docx.c
index 4532cd4e..761de176 100644
--- a/extract/src/docx.c
+++ b/extract/src/docx.c
@@ -21,6 +21,7 @@ docx_paragraph_finish(). */
#include <assert.h>
#include <errno.h>
+#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
@@ -29,46 +30,42 @@ docx_paragraph_finish(). */
#include <sys/stat.h>
-static int extract_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
{
return extract_astring_cat(alloc, content, "\n\n<w:p>");
}
-static int extract_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
{
return extract_astring_cat(alloc, content, "\n</w:p>");
}
-static int extract_docx_run_start(
+static int s_docx_run_start(
extract_alloc_t* alloc,
extract_astring_t* content,
- const char* font_name,
- double font_size,
- int bold,
- int italic
+ content_state_t* content_state
)
-/* Starts a new run. Caller must ensure that extract_docx_run_finish() was
+/* Starts a new run. Caller must ensure that s_docx_run_finish() was
called to terminate any previous run. */
{
int e = 0;
if (!e) e = extract_astring_cat(alloc, content, "\n<w:r><w:rPr><w:rFonts w:ascii=\"");
- if (!e) e = extract_astring_cat(alloc, content, font_name);
+ if (!e) e = extract_astring_cat(alloc, content, content_state->font.name);
if (!e) e = extract_astring_cat(alloc, content, "\" w:hAnsi=\"");
- if (!e) e = extract_astring_cat(alloc, content, font_name);
+ if (!e) e = extract_astring_cat(alloc, content, content_state->font.name);
if (!e) e = extract_astring_cat(alloc, content, "\"/>");
- if (!e && bold) e = extract_astring_cat(alloc, content, "<w:b/>");
- if (!e && italic) e = extract_astring_cat(alloc, content, "<w:i/>");
+ if (!e && content_state->font.bold) e = extract_astring_cat(alloc, content, "<w:b/>");
+ if (!e && content_state->font.italic) e = extract_astring_cat(alloc, content, "<w:i/>");
{
char font_size_text[32];
- if (0) font_size = 10;
if (!e) e = extract_astring_cat(alloc, content, "<w:sz w:val=\"");
- snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 2);
+ snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2);
extract_astring_cat(alloc, content, font_size_text);
extract_astring_cat(alloc, content, "\"/>");
if (!e) e = extract_astring_cat(alloc, content, "<w:szCs w:val=\"");
- snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 1.5);
+ snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 1.5);
extract_astring_cat(alloc, content, font_size_text);
extract_astring_cat(alloc, content, "\"/>");
}
@@ -77,38 +74,39 @@ called to terminate any previous run. */
}
-static int extract_docx_run_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_run_finish(extract_alloc_t* alloc, content_state_t* state, extract_astring_t* content)
{
+ if (state) state->font.name = NULL;
return extract_astring_cat(alloc, content, "</w:t></w:r>");
}
-static int extract_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content)
/* Append an empty paragraph to *content. */
{
int e = -1;
- if (extract_docx_paragraph_start(alloc, content)) goto end;
+ static char fontname[] = "OpenSans";
+ content_state_t content_state = {0};
+ if (s_docx_paragraph_start(alloc, content)) goto end;
/* It seems like our choice of font size here doesn't make any difference
to the amount of vertical space, unless we include a non-space
character. Presumably something to do with the styles in the template
document. */
- if (extract_docx_run_start(
- alloc,
- content,
- "OpenSans",
- 10 /*font_size*/,
- 0 /*font_bold*/,
- 0 /*font_italic*/
- )) goto end;
+ content_state.font.name = fontname;
+ content_state.font.size = 10;
+ content_state.font.bold = 0;
+ content_state.font.italic = 0;
+
+ if (s_docx_run_start(alloc, content, &content_state)) goto end;
//docx_char_append_string(content, "&#160;"); /* &#160; is non-break space. */
- if (extract_docx_run_finish(alloc, content)) goto end;
- if (extract_docx_paragraph_finish(alloc, content)) goto end;
+ if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end;
+ if (s_docx_paragraph_finish(alloc, content)) goto end;
e = 0;
end:
return e;
}
-static int extract_docx_char_truncate_if(extract_astring_t* content, char c)
+static int s_docx_char_truncate_if(extract_astring_t* content, char c)
/* Removes last char if it is <c>. */
{
if (content->chars_num && content->chars[content->chars_num-1] == c) {
@@ -118,22 +116,9 @@ static int extract_docx_char_truncate_if(extract_astring_t* content, char c)
}
-typedef struct
-{
- const char* font_name;
- double font_size;
- int font_bold;
- int font_italic;
- matrix_t* ctm_prev;
-} content_state_t;
-/* Used to keep track of font information when writing paragraphs of docx
-content, e.g. so we know whether a font has changed so need to start a new docx
-span. */
-
-
-static int extract_document_to_docx_content_paragraph(
+static int s_document_to_docx_content_paragraph(
extract_alloc_t* alloc,
- content_state_t* state,
+ content_state_t* content_state,
paragraph_t* paragraph,
extract_astring_t* content
)
@@ -142,7 +127,7 @@ font. */
{
int e = -1;
int l;
- if (extract_docx_paragraph_start(alloc, content)) goto end;
+ if (s_docx_paragraph_start(alloc, content)) goto end;
for (l=0; l<paragraph->lines_num; ++l) {
line_t* line = paragraph->lines[l];
@@ -151,45 +136,38 @@ font. */
int si;
span_t* span = line->spans[s];
double font_size_new;
- state->ctm_prev = &span->ctm;
+ content_state->ctm_prev = &span->ctm;
font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm);
- if (!state->font_name
- || strcmp(span->font_name, state->font_name)
- || span->flags.font_bold != state->font_bold
- || span->flags.font_italic != state->font_italic
- || font_size_new != state->font_size
+ if (!content_state->font.name
+ || strcmp(span->font_name, content_state->font.name)
+ || span->flags.font_bold != content_state->font.bold
+ || span->flags.font_italic != content_state->font.italic
+ || font_size_new != content_state->font.size
) {
- if (state->font_name) {
- if (extract_docx_run_finish(alloc, content)) goto end;
+ if (content_state->font.name) {
+ if (s_docx_run_finish(alloc, content_state, content)) goto end;
}
- state->font_name = span->font_name;
- state->font_bold = span->flags.font_bold;
- state->font_italic = span->flags.font_italic;
- state->font_size = font_size_new;
- if (extract_docx_run_start(
- alloc,
- content,
- state->font_name,
- state->font_size,
- state->font_bold,
- state->font_italic
- )) goto end;
+ content_state->font.name = span->font_name;
+ content_state->font.bold = span->flags.font_bold;
+ content_state->font.italic = span->flags.font_italic;
+ content_state->font.size = font_size_new;
+ if (s_docx_run_start(alloc, content, content_state)) goto end;
}
for (si=0; si<span->chars_num; ++si) {
char_t* char_ = &span->chars[si];
int c = char_->ucs;
- if (extract_astring_cat_xmlc(alloc, content, c)) goto end;
+ if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end;
}
/* Remove any trailing '-' at end of line. */
- if (extract_docx_char_truncate_if(content, '-')) goto end;
+ if (s_docx_char_truncate_if(content, '-')) goto end;
}
}
- if (state->font_name) {
- if (extract_docx_run_finish(alloc, content)) goto end;
- state->font_name = NULL;
+ if (content_state->font.name)
+ {
+ if (s_docx_run_finish(alloc, content_state, content)) goto end;
}
- if (extract_docx_paragraph_finish(alloc, content)) goto end;
+ if (s_docx_paragraph_finish(alloc, content)) goto end;
e = 0;
@@ -197,7 +175,7 @@ font. */
return e;
}
-static int extract_document_append_image(
+static int s_docx_append_image(
extract_alloc_t* alloc,
extract_astring_t* content,
image_t* image
@@ -265,7 +243,7 @@ static int extract_document_append_image(
}
-static int extract_document_output_rotated_paragraphs(
+static int s_docx_output_rotated_paragraphs(
extract_alloc_t* alloc,
extract_page_t* page,
int paragraph_begin,
@@ -353,7 +331,7 @@ static int extract_document_output_rotated_paragraphs(
/* Output paragraphs p0..p2-1. */
for (p=paragraph_begin; p<paragraph_end; ++p) {
paragraph_t* paragraph = page->paragraphs[p];
- if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
+ if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
}
extract_astring_cat(alloc, content, "\n");
@@ -387,7 +365,7 @@ static int extract_document_output_rotated_paragraphs(
for (p=paragraph_begin; p<paragraph_end; ++p) {
paragraph_t* paragraph = page->paragraphs[p];
- if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
+ if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
}
extract_astring_cat(alloc, content, "\n");
@@ -406,6 +384,257 @@ static int extract_document_output_rotated_paragraphs(
}
+static int s_docx_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content)
+/* Appends table to content.
+
+We do not fix the size of the table or its columns and rows, but instead leave layout up
+to the application.
+
+Cells whose <left> flag is zero are skipped. Returns 0 on success, or -1 if
+appending to <content> failed. */
+{
+    int e = -1;
+    int y;
+
+    if (extract_astring_cat(alloc, content,
+            "\n"
+            " <w:tbl>\n"
+            " <w:tblLayout w:type=\"autofit\"/>\n"
+            )) goto end;
+
+    for (y=0; y<table->cells_num_y; ++y)
+    {
+        int x;
+        if (extract_astring_cat(alloc, content,
+                " <w:tr>\n"
+                " <w:trPr/>\n"
+                )) goto end;
+
+        for (x=0; x<table->cells_num_x; ++x)
+        {
+            cell_t* cell = table->cells[y*table->cells_num_x + x];
+            if (!cell->left) continue;
+
+            if (extract_astring_cat(alloc, content, " <w:tc>\n")) goto end;
+
+            /* Write cell properties. */
+            {
+                if (extract_astring_cat(alloc, content,
+                        " <w:tcPr>\n"
+                        " <w:tcBorders>\n"
+                        " <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        " <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        " <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        " <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        " </w:tcBorders>\n"
+                        )) goto end;
+                if (cell->extend_right > 1)
+                {
+                    if (extract_astring_catf(alloc, content, " <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right)) goto end;
+                }
+                if (cell->above)
+                {
+                    if (cell->extend_down > 1)
+                    {
+                        /* Constant text: the row count is not written, so no
+                        format argument is passed. */
+                        if (extract_astring_cat(alloc, content, " <w:vMerge w:val=\"restart\"/>\n")) goto end;
+                    }
+                }
+                else
+                {
+                    if (extract_astring_cat(alloc, content, " <w:vMerge w:val=\"continue\"/>\n")) goto end;
+                }
+                if (extract_astring_cat(alloc, content, " </w:tcPr>\n")) goto end;
+            }
+
+            /* Write contents of this cell. */
+            {
+                size_t chars_num_old = content->chars_num;
+                int p;
+                content_state_t content_state = {0};
+                content_state.font.name = NULL;
+                content_state.ctm_prev = NULL;
+                for (p=0; p<cell->paragraphs_num; ++p)
+                {
+                    paragraph_t* paragraph = cell->paragraphs[p];
+                    if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
+                }
+                if (content_state.font.name)
+                {
+                    if (s_docx_run_finish(alloc, &content_state, content)) goto end;
+                }
+
+                /* Need to write out at least an empty paragraph in each cell,
+                otherwise Word/Libreoffice fail to show table at all; the
+                OOXML spec says "If a table cell does not include at least one
+                block-level element, then this document shall be considered
+                corrupt." */
+                if (content->chars_num == chars_num_old)
+                {
+                    if (extract_astring_cat(alloc, content, "<w:p/>\n")) goto end;
+                }
+            }
+            if (extract_astring_cat(alloc, content, " </w:tc>\n")) goto end;
+        }
+        if (extract_astring_cat(alloc, content, " </w:tr>\n")) goto end;
+    }
+    if (extract_astring_cat(alloc, content, " </w:tbl>\n")) goto end;
+    e = 0;
+
+    end:
+    return e;
+}
+
+static int s_docx_append_rotated_paragraphs(
+ extract_alloc_t* alloc,
+ extract_page_t* page,
+ content_state_t* state,
+ int* p,
+ int* text_box_id,
+ const matrix_t* ctm,
+ double rotate,
+ extract_astring_t* content
+ )
+/* Appends paragraphs with same rotation, starting with page->paragraphs[*p]
+and updates *p.
+
+The run of paragraphs is written as a single rotated text box by
+s_docx_output_rotated_paragraphs(). <ctm> and <rotate> are presumably those of
+page->paragraphs[*p] - confirm against the caller. *text_box_id is incremented
+to give the text box a new id.
+
+Returns 0 with *p set to the index of the last paragraph in the run, or -1 if
+writing the text box failed. */
+{
+ /* Find extent of paragraphs with this same rotation. extent
+ will contain max width and max height of paragraphs, in units
+ before application of ctm, i.e. before rotation. */
+ int e = -1;
+ point_t extent = {0, 0};
+ int p0 = *p;
+ int p1;
+ paragraph_t* paragraph = page->paragraphs[*p];
+
+ outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
+ rotate, rotate * 180 / pi,
+ ctm->e,
+ ctm->f,
+ ctm->a,
+ ctm->b,
+ ctm->c,
+ ctm->d
+ );
+
+ {
+ /* We assume that first span is at origin of text
+ block. This assumes left-to-right text. */
+ double rotate0 = rotate;
+ const matrix_t* ctm0 = ctm;
+ point_t origin = {
+ paragraph->lines[0]->spans[0]->chars[0].x,
+ paragraph->lines[0]->spans[0]->chars[0].y
+ };
+ matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0}; /* Identity; kept as-is if ctm cannot be inverted. */
+ double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
+ if (ctm_det != 0) {
+ ctm_inverse.a = +ctm->d / ctm_det;
+ ctm_inverse.b = -ctm->b / ctm_det;
+ ctm_inverse.c = -ctm->c / ctm_det;
+ ctm_inverse.d = +ctm->a / ctm_det;
+ }
+ else {
+ outf("cannot invert ctm=(%f %f %f %f)",
+ ctm->a, ctm->b, ctm->c, ctm->d);
+ }
+
+ for (*p=p0; *p<page->paragraphs_num; ++(*p)) {
+ paragraph = page->paragraphs[*p];
+ ctm = &paragraph->lines[0]->spans[0]->ctm;
+ rotate = atan2(ctm->b, ctm->a);
+ if (rotate != rotate0) { /* Exact comparison: the run ends at the first paragraph with a different angle. */
+ break;
+ }
+
+ /* Update <extent>. */
+ {
+ int l;
+ for (l=0; l<paragraph->lines_num; ++l) {
+ line_t* line = paragraph->lines[l];
+ span_t* span = extract_line_span_last(line);
+ char_t* char_ = extract_span_char_last(span);
+ double adv = char_->adv * extract_matrix_expansion(span->trm);
+ double x = char_->x + adv * cos(rotate);
+ double y = char_->y + adv * sin(rotate);
+
+ double dx = x - origin.x;
+ double dy = y - origin.y;
+
+ /* Position relative to origin and before box rotation. */
+ double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
+ double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
+ yy = -yy;
+ if (xx > extent.x) extent.x = xx;
+ if (yy > extent.y) extent.y = yy;
+ if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
+ rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span));
+ }
+ }
+ }
+ p1 = *p;
+ rotate = rotate0; /* Restore the caller's values, which the loop above overwrote. */
+ ctm = ctm0;
+ outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
+ rotate, p0, p1, extent.x, extent.y);
+ }
+
+ /* Paragraphs p0..p1-1 have same rotation. We output them into
+ a single rotated text box. */
+
+ /* We need unique id for text box. */
+ *text_box_id += 1;
+
+ {
+ /* Angles are in units of 1/60,000 degree. */
+ int rot = (int) (rotate * 180 / pi * 60000);
+
+ /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm.
+ relativeHeight is z-ordering. (wp:positionV:wp:posOffset,
+ wp:positionV:wp:posOffset) is position of origin of box in
+ EMU.
+
+ The box rotates about its centre but we want to rotate
+ about the origin (top-left). So we correct the position of
+ box by subtracting the vector that the top-left moves when
+ rotated by angle <rotate> about the middle. */
+ double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */
+ int x = (int) (ctm->e * point_to_emu);
+ int y = (int) (ctm->f * point_to_emu);
+ int w = (int) (extent.x * point_to_emu);
+ int h = (int) (extent.y * point_to_emu);
+ int dx;
+ int dy;
+
+ if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot);
+
+ h *= 2;
+ /* We can't predict how much space Word will actually
+ require for the rotated text, so make the box have the
+ original width but allow text to take extra vertical
+ space. There doesn't seem to be a way to make the text box
+ auto-grow to contain the text. */
+
+ dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0);
+ dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0);
+ outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik",
+ ctm->e,
+ ctm->f,
+ rotate * 180/pi,
+ x/1000,
+ y/1000,
+ dx/1000,
+ dy/1000
+ );
+ x -= dx;
+ y -= -dy;
+
+ if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end;
+ }
+ *p = p1 - 1; /* Index of the last paragraph in the run just output. */
+ e = 0;
+
+ end:
+
+ return e;
+}
+
int extract_document_to_docx_content(
extract_alloc_t* alloc,
document_t* document,
@@ -422,184 +651,73 @@ int extract_document_to_docx_content(
/* Write paragraphs into <content>. */
for (p=0; p<document->pages_num; ++p) {
extract_page_t* page = document->pages[p];
- int p;
- content_state_t state;
- state.font_name = NULL;
- state.font_size = 0;
- state.font_bold = 0;
- state.font_italic = 0;
- state.ctm_prev = NULL;
- for (p=0; p<page->paragraphs_num; ++p) {
- paragraph_t* paragraph = page->paragraphs[p];
- const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
- double rotate = atan2(ctm->b, ctm->a);
+ int p = 0;
+ int t = 0;
+
+ content_state_t content_state;
+ content_state.font.name = NULL;
+ content_state.font.size = 0;
+ content_state.font.bold = 0;
+ content_state.font.italic = 0;
+ content_state.ctm_prev = NULL;
+
+ /* Output paragraphs and tables in order of y coordinate. */
+ for(;;)
+ {
+ paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p];
+ table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+ double y_paragraph;
+ double y_table;
+ if (!paragraph && !table) break;
+ y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+ y_table = (table) ? table->pos.y : DBL_MAX;
- if (spacing
- && state.ctm_prev
- && paragraph->lines_num
- && paragraph->lines[0]->spans_num
- && matrix_cmp4(
- state.ctm_prev,
- &paragraph->lines[0]->spans[0]->ctm
- )
- ) {
- /* Extra vertical space between paragraphs that were at
- different angles in the original document. */
- if (extract_docx_paragraph_empty(alloc, content)) goto end;
- }
+ if (paragraph && y_paragraph < y_table)
+ {
+ const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
+ double rotate = atan2(ctm->b, ctm->a);
+
+ if (spacing
+ && content_state.ctm_prev
+ && paragraph->lines_num
+ && paragraph->lines[0]->spans_num
+ && extract_matrix_cmp4(
+ content_state.ctm_prev,
+ &paragraph->lines[0]->spans[0]->ctm
+ )
+ ) {
+ /* Extra vertical space between paragraphs that were at
+ different angles in the original document. */
+ if (s_docx_paragraph_empty(alloc, content)) goto end;
+ }
- if (spacing) {
- /* Extra vertical space between paragraphs. */
- if (extract_docx_paragraph_empty(alloc, content)) goto end;
- }
-
- if (rotation && rotate != 0) {
-
- /* Find extent of paragraphs with this same rotation. extent
- will contain max width and max height of paragraphs, in units
- before application of ctm, i.e. before rotation. */
- point_t extent = {0, 0};
- int p0 = p;
- int p1;
-
- outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
- rotate, rotate * 180 / pi,
- ctm->e,
- ctm->f,
- ctm->a,
- ctm->b,
- ctm->c,
- ctm->d
- );
-
- {
- /* We assume that first span is at origin of text
- block. This assumes left-to-right text. */
- double rotate0 = rotate;
- const matrix_t* ctm0 = ctm;
- point_t origin = {
- paragraph->lines[0]->spans[0]->chars[0].x,
- paragraph->lines[0]->spans[0]->chars[0].y
- };
- matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
- double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
- if (ctm_det != 0) {
- ctm_inverse.a = +ctm->d / ctm_det;
- ctm_inverse.b = -ctm->b / ctm_det;
- ctm_inverse.c = -ctm->c / ctm_det;
- ctm_inverse.d = +ctm->a / ctm_det;
- }
- else {
- outf("cannot invert ctm=(%f %f %f %f)",
- ctm->a, ctm->b, ctm->c, ctm->d);
- }
+ if (spacing) {
+ /* Extra vertical space between paragraphs. */
+ if (s_docx_paragraph_empty(alloc, content)) goto end;
+ }
- for (p=p0; p<page->paragraphs_num; ++p) {
- paragraph = page->paragraphs[p];
- ctm = &paragraph->lines[0]->spans[0]->ctm;
- rotate = atan2(ctm->b, ctm->a);
- if (rotate != rotate0) {
- break;
- }
-
- /* Update <extent>. */
- {
- int l;
- for (l=0; l<paragraph->lines_num; ++l) {
- line_t* line = paragraph->lines[l];
- span_t* span = line_span_last(line);
- char_t* char_ = span_char_last(span);
- double adv = char_->adv * matrix_expansion(span->trm);
- double x = char_->x + adv * cos(rotate);
- double y = char_->y + adv * sin(rotate);
-
- double dx = x - origin.x;
- double dy = y - origin.y;
-
- /* Position relative to origin and before box rotation. */
- double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
- double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
- yy = -yy;
- if (xx > extent.x) extent.x = xx;
- if (yy > extent.y) extent.y = yy;
- if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
- rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span));
- }
- }
- }
- p1 = p;
- rotate = rotate0;
- ctm = ctm0;
- outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
- rotate, p0, p1, extent.x, extent.y);
+ if (rotation && rotate != 0)
+ {
+ if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end;
}
-
- /* Paragraphs p0..p1-1 have same rotation. We output them into
- a single rotated text box. */
-
- /* We need unique id for text box. */
- text_box_id += 1;
-
+ else
{
- /* Angles are in units of 1/60,000 degree. */
- int rot = (int) (rotate * 180 / pi * 60000);
-
- /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm.
- relativeHeight is z-ordering. (wp:positionV:wp:posOffset,
- wp:positionV:wp:posOffset) is position of origin of box in
- EMU.
-
- The box rotates about its centre but we want to rotate
- about the origin (top-left). So we correct the position of
- box by subtracting the vector that the top-left moves when
- rotated by angle <rotate> about the middle. */
- double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */
- int x = (int) (ctm->e * point_to_emu);
- int y = (int) (ctm->f * point_to_emu);
- int w = (int) (extent.x * point_to_emu);
- int h = (int) (extent.y * point_to_emu);
- int dx;
- int dy;
-
- if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot);
-
- h *= 2;
- /* We can't predict how much space Word will actually
- require for the rotated text, so make the box have the
- original width but allow text to take extra vertical
- space. There doesn't seem to be a way to make the text box
- auto-grow to contain the text. */
-
- dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0);
- dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0);
- outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik",
- ctm->e,
- ctm->f,
- rotate * 180/pi,
- x/1000,
- y/1000,
- dx/1000,
- dy/1000
- );
- x -= dx;
- y -= -dy;
-
- if (extract_document_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, text_box_id, content, &state)) goto end;
+ if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
}
- p = p1 - 1;
- //p = page->paragraphs_num - 1;
+ p += 1;
}
- else {
- if (extract_document_to_docx_content_paragraph(alloc, &state, paragraph, content)) goto end;
+ else if (table)
+ {
+ if (s_docx_append_table(alloc, table, content)) goto end;
+ t += 1;
}
-
}
if (images) {
int i;
for (i=0; i<page->images_num; ++i) {
- extract_document_append_image(alloc, content, &page->images[i]);
+ s_docx_append_image(alloc, content, &page->images[i]);
}
}
}
@@ -738,7 +856,6 @@ int extract_docx_write_template(
int e = -1;
int i;
char* path_tempdir = NULL;
- FILE* f = NULL;
char* path = NULL;
char* text = NULL;
char* text2 = NULL;
@@ -841,7 +958,6 @@ int extract_docx_write_template(
extract_free(alloc, &path);
extract_free(alloc, &text);
extract_free(alloc, &text2);
- if (f) fclose(f);
if (e) {
outf("Failed to create %s", path_out);
diff --git a/extract/src/docx.h b/extract/src/docx.h
index 6e26568f..976272a6 100644
--- a/extract/src/docx.h
+++ b/extract/src/docx.h
@@ -13,8 +13,8 @@ int extract_document_to_docx_content(
int images,
extract_astring_t* content
);
-/* Makes *o_content point to a string containing all paragraphs in *document in
-docx XML format.
+/* Makes *o_content point to a string containing all paragraphs, images and
+tables (tables as of 2021-07-22) in *document in docx XML format.
This string can be passed to extract_docx_content_item() or
extract_docx_write_template() to be inserted into a docx archive's
diff --git a/extract/src/docx_template_build.py b/extract/src/docx_template_build.py
index 5e2f5380..8b836300 100755
--- a/extract/src/docx_template_build.py
+++ b/extract/src/docx_template_build.py
@@ -9,6 +9,9 @@ Args:
--pretty <directory>
Prettyfies all .xml files within <directory> using 'xmllint --format'.
+ -f
+ Force touch of output file, even if unchanged.
+
-i <in-path>
Set template docx/odt file to extract from.
@@ -57,12 +60,17 @@ def write(text, path, encoding):
with open(path, 'wb') as f:
f.write(text.encode(encoding))
-def write_if_diff(text, path, encoding):
- if os.path.isfile(path):
- old = read(path, encoding)
- if old == text:
- return
- print(f'Updating path={path} because contents have changed')
+def write_if_diff(text, path, encoding, force):
+ '''
+ Does nothing if <force> is false and file named <path> already contains
+ <text>. Otherwise writes <text> to file named <path>.
+ '''
+ if not force:
+ if os.path.isfile(path):
+ old = read(path, encoding)
+ if old == text:
+ return
+ print(f'Updating path={path} because contents have changed')
write(text, path, encoding)
def check_path_safe(path):
@@ -98,6 +106,8 @@ def main():
path_in = None
path_out = None
infix = None
+ force = False
+
args = iter(sys.argv[1:])
while 1:
try: arg = next(args)
@@ -114,6 +124,8 @@ def main():
path = os.path.join(dirpath, filename)
system(f'xmllint --format {path} > {path}-')
system(f'mv {path}- {path}')
+ elif arg == '-f':
+ force = True
elif arg == '-i':
path_in = next(args)
elif arg == '-n':
@@ -166,7 +178,7 @@ def main():
for filename in sorted(filenames):
num_items += 1
path = os.path.join(dirpath, filename)
- print(f'looking at path={path}')
+ #print(f'looking at path={path}')
name = path[ len(path_temp)+1: ]
out_c.write(f' {{\n')
out_c.write(f' "{name}",\n')
@@ -213,7 +225,7 @@ def main():
out_c.write(f'int {infix}_template_items_num = {num_items};\n')
out_c = out_c.getvalue()
- write_if_diff(out_c, f'{path_out}.c', 'utf-8')
+ write_if_diff(out_c, f'{path_out}.c', 'utf-8', force)
out_h = io.StringIO()
out_h.write(f'#ifndef EXTRACT_{infix.upper()}_TEMPLATE_H\n')
@@ -233,7 +245,7 @@ def main():
out_h.write(f'\n')
out_h.write(f'\n')
out_h.write(f'#endif\n')
- write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8')
+ write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8', force)
#os.system(f'rm -r "{path_temp}"')
if __name__ == '__main__':
diff --git a/extract/src/extract-exe.c b/extract/src/extract-exe.c
index 22b520db..ee34023a 100644
--- a/extract/src/extract-exe.c
+++ b/extract/src/extract-exe.c
@@ -139,6 +139,7 @@ int main(int argc, char** argv)
if (arg_next_string(argv, argc, &i, &format_name)) goto end;
if (!strcmp(format_name, "odt")) format = extract_format_ODT;
else if (!strcmp(format_name, "docx")) format = extract_format_DOCX;
+ else if (!strcmp(format_name, "html")) format = extract_format_HTML;
else
{
printf("-f value should be 'odt' or 'docx', not '%s'.\n", format_name);
@@ -170,7 +171,7 @@ int main(int argc, char** argv)
else if (!strcmp(arg, "-v")) {
int verbose;
if (arg_next_int(argv, argc, &i, &verbose)) goto end;
- outf_verbose_set(verbose);
+ extract_outf_verbose_set(verbose);
outf("Have changed verbose to %i", verbose);
}
else if (!strcmp(arg, "--v-alloc")) {
diff --git a/extract/src/extract.c b/extract/src/extract.c
index 9eb85d2f..2c375571 100644
--- a/extract/src/extract.c
+++ b/extract/src/extract.c
@@ -5,6 +5,7 @@
#include "document.h"
#include "docx.h"
#include "docx_template.h"
+#include "html.h"
#include "mem.h"
#include "memento.h"
#include "odt.h"
@@ -25,7 +26,7 @@
-double matrix_expansion(matrix_t m)
+double extract_matrix_expansion(matrix_t m)
{
return sqrt(fabs(m.a * m.d - m.b * m.c));
}
@@ -41,14 +42,31 @@ static void char_init(char_t* item)
item->adv = 0;
}
+/* Formats <point> as "(x y)" for use in diagnostics. The text is written into
+a single static buffer, so each call overwrites the string returned by the
+previous call. */
+const char* extract_point_string(const point_t* point)
+{
+    static char text[128];
+
+    snprintf(text, sizeof text, "(%f %f)",
+            point->x,
+            point->y
+            );
+    return text;
+}
+
+/* Formats <rect> as "((min.x min.y) (max.x max.y))" for use in diagnostics.
+Successive calls alternate between two static buffers, so the strings returned
+by the two most recent calls can be used at the same time; a third call
+overwrites the oldest one. */
+const char* extract_rect_string(const rect_t* rect)
+{
+    static char texts[2][256];
+    static int slot = 0;
+    char* text;
+
+    /* Switch to the other buffer before writing. */
+    slot = (slot + 1) % 2;
+    text = texts[slot];
+    snprintf(text, sizeof(texts[0]), "((%f %f) (%f %f))",
+            rect->min.x,
+            rect->min.y,
+            rect->max.x,
+            rect->max.y
+            );
+    return text;
+}
-const char* span_string(extract_alloc_t* alloc, span_t* span)
+const char* extract_span_string(extract_alloc_t* alloc, span_t* span)
{
static extract_astring_t ret = {0};
double x0 = 0;
double y0 = 0;
+ point_t pre0 = {0, 0};
double x1 = 0;
double y1 = 0;
+ point_t pre1 = {0, 0};
int c0 = 0;
int c1 = 0;
int i;
@@ -62,17 +80,23 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
c0 = span->chars[0].ucs;
x0 = span->chars[0].x;
y0 = span->chars[0].y;
+ pre0.x = span->chars[0].pre_x;
+ pre0.y = span->chars[0].pre_y;
c1 = span->chars[span->chars_num-1].ucs;
x1 = span->chars[span->chars_num-1].x;
y1 = span->chars[span->chars_num-1].y;
+ pre1.x = span->chars[span->chars_num-1].pre_x;
+ pre1.y = span->chars[span->chars_num-1].pre_y;
}
{
- char buffer[200];
+ char buffer[400];
snprintf(buffer, sizeof(buffer),
- "span chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f,%f) wmode=%i chars_num=%i: ",
+ "span ctm=%s trm=%s chars_num=%i (%c:%f,%f pre(%f %f))..(%c:%f,%f pre(%f %f)) font=%s:(%f,%f) wmode=%i chars_num=%i: ",
+ extract_matrix_string(&span->ctm),
+ extract_matrix_string(&span->trm),
span->chars_num,
- c0, x0, y0,
- c1, x1, y1,
+ c0, x0, y0, pre0.x, pre0.y,
+ c1, x1, y1, pre1.x, pre1.y,
span->font_name,
span->trm.a,
span->trm.d,
@@ -84,9 +108,11 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
snprintf(
buffer,
sizeof(buffer),
- " i=%i {x=%f adv=%f}",
+ " i=%i {x=%f y=%f ucs=%i adv=%f}",
i,
span->chars[i].x,
+ span->chars[i].y,
+ span->chars[i].ucs,
span->chars[i].adv
);
extract_astring_cat(alloc, &ret, buffer);
@@ -101,7 +127,7 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
return ret.chars;
}
-int span_append_c(extract_alloc_t* alloc, span_t* span, int c)
+int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c)
{
char_t* item;
if (extract_realloc2(
@@ -119,7 +145,7 @@ int span_append_c(extract_alloc_t* alloc, span_t* span, int c)
return 0;
}
-char_t* span_char_last(span_t* span)
+char_t* extract_span_char_last(span_t* span)
{
assert(span->chars_num > 0);
return &span->chars[span->chars_num-1];
@@ -138,58 +164,62 @@ static const char* line_string(line_t* line)
int i;
for (i=0; i<line->spans_num; ++i) {
extract_astring_cat(&ret, " ");
- extract_astring_cat(&ret, span_string(line->spans[i]));
+ extract_astring_cat(&ret, extract_span_string(line->spans[i]));
}
return ret.chars;
}
#endif
/* Returns last span in a line. */
-span_t* line_span_last(line_t* line)
+span_t* extract_line_span_last(line_t* line)
{
assert(line->spans_num > 0);
return line->spans[line->spans_num - 1];
}
-span_t* line_span_first(line_t* line)
+span_t* extract_line_span_first(line_t* line)
{
assert(line->spans_num > 0);
return line->spans[0];
}
-static void page_free(extract_alloc_t* alloc, extract_page_t* page)
+
+/* Frees the table at *ptable together with what it owns: each of its
+cells_num_x * cells_num_y cells (via extract_cell_free()), then the cells
+array, then the table itself (via extract_free() on <ptable>).
+
+Does nothing if *ptable is NULL, in the same way that page_free() ignores a
+NULL page. */
+static void table_free(extract_alloc_t* alloc, table_t** ptable)
+{
+    int c;
+    int cells_num;
+    table_t* table = *ptable;
+
+    /* Guard before the first dereference of <table>. */
+    if (!table) return;
+
+    outf("table->cells_num_x=%i table->cells_num_y=%i",
+            table->cells_num_x,
+            table->cells_num_y
+            );
+    cells_num = table->cells_num_x * table->cells_num_y;
+    for (c = 0; c < cells_num; ++c)
+    {
+        extract_cell_free(alloc, &table->cells[c]);
+    }
+    extract_free(alloc, &table->cells);
+    extract_free(alloc, ptable);
+}
+
+static void page_free(extract_alloc_t* alloc, extract_page_t** ppage)
{
- int s;
+ extract_page_t* page = *ppage;
if (!page) return;
- for (s=0; s<page->spans_num; ++s) {
- span_t* span = page->spans[s];
- if (span) {
- extract_free(alloc, &span->chars);
- extract_free(alloc, &span->font_name);
- }
- extract_free(alloc, &span);
- }
- extract_free(alloc, &page->spans);
+ outf0("page=%p page->spans_num=%i page->lines_num=%i",
+ page, page->spans_num, page->lines_num);
+ extract_spans_free(alloc, &page->spans, page->spans_num);
- {
- int l;
- for (l=0; l<page->lines_num; ++l) {
- line_t* line = page->lines[l];
- extract_free(alloc, &line->spans);
- extract_free(alloc, &line);
- /* We don't free line->spans->chars[] because already freed via
- page->spans. */
- }
- }
- extract_free(alloc, &page->lines);
+ extract_lines_free(alloc, &page->lines, page->lines_num);
{
int p;
for (p=0; p<page->paragraphs_num; ++p) {
paragraph_t* paragraph = page->paragraphs[p];
+ /* We don't call extract_lines_free(&paragraph->lines) because
+ these point into the same data as page->lines, which we have
+ already freed above. */
if (paragraph) extract_free(alloc, &paragraph->lines);
- extract_free(alloc, &paragraph);
+ extract_free(alloc, &page->paragraphs[p]);
}
}
extract_free(alloc, &page->paragraphs);
@@ -197,13 +227,26 @@ static void page_free(extract_alloc_t* alloc, extract_page_t* page)
{
int i;
for (i=0; i<page->images_num; ++i) {
- extract_free(alloc, &page->images[i].data);
- extract_free(alloc, &page->images[i].type);
- extract_free(alloc, &page->images[i].id);
- extract_free(alloc, &page->images[i].name);
+ extract_image_clear(alloc, &page->images[i]);
}
+ extract_free(alloc, &page->images);
}
extract_free(alloc, &page->images);
+
+ extract_free(alloc, &page->tablelines_horizontal.tablelines);
+ extract_free(alloc, &page->tablelines_vertical.tablelines);
+
+ {
+ int t;
+ outf("page=%p page->tables_num=%i", page, page->tables_num);
+ for (t=0; t<page->tables_num; ++t)
+ {
+ table_free(alloc, &page->tables[t]);
+ }
+ extract_free(alloc, &page->tables);
+ }
+
+ extract_free(alloc, ppage);
}
static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page)
@@ -212,9 +255,7 @@ error. */
{
span_t* span;
if (extract_malloc(alloc, &span, sizeof(*span))) return NULL;
- span->font_name = NULL;
- span->chars = NULL;
- span->chars_num = 0;
+ extract_span_init(span);
if (extract_realloc2(
alloc,
&page->spans,
@@ -234,14 +275,7 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images)
{
int i;
for (i=0; i<images->images_num; ++i) {
- image_t* image = &images->images[i];
- extract_free(alloc, &image->type);
- extract_free(alloc, &image->name);
- extract_free(alloc, &image->id);
- if (image->data_free) {
- image->data_free(image->data_free_handle, image->data);
- }
- extract_free(alloc, &images->images[i]);
+ extract_image_clear(alloc, &images->images[i]);
}
extract_free(alloc, &images->images);
extract_free(alloc, &images->imagetypes);
@@ -260,10 +294,12 @@ On return document->page[].images* will be NULL etc.
int p;
images_t images = {0};
outf("extract_document_images(): images.images_num=%i", images.images_num);
- for (p=0; p<document->pages_num; ++p) {
+ for (p=0; p<document->pages_num; ++p)
+ {
extract_page_t* page = document->pages[p];
int i;
- for (i=0; i<page->images_num; ++i) {
+ for (i=0; i<page->images_num; ++i)
+ {
image_t* image;
if (extract_realloc2(
alloc,
@@ -280,14 +316,17 @@ On return document->page[].images* will be NULL etc.
/* Add image type if we haven't seen it before. */
{
int it;
- for (it=0; it<images.imagetypes_num; ++it) {
+ for (it=0; it<images.imagetypes_num; ++it)
+ {
outf("it=%i images.imagetypes[it]=%s image->type=%s",
it, images.imagetypes[it], image->type);
if (!strcmp(images.imagetypes[it], image->type)) {
break;
}
}
- if (it == images.imagetypes_num) {
+ if (it == images.imagetypes_num)
+ {
+ /* We haven't seen this image type before. */
if (extract_realloc2(
alloc,
&images.imagetypes,
@@ -314,9 +353,12 @@ On return document->page[].images* will be NULL etc.
}
e = 0;
end:
- if (e) {
+ if (e)
+ {
+ extract_free(alloc, &images.images);
}
- else {
+ else
+ {
*o_images = images;
}
return e;
@@ -330,8 +372,7 @@ static void extract_document_free(extract_alloc_t* alloc, document_t* document)
}
for (p=0; p<document->pages_num; ++p) {
extract_page_t* page = document->pages[p];
- page_free(alloc, page);
- extract_free(alloc, &page);
+ page_free(alloc, &page);
}
extract_free(alloc, &document->pages);
document->pages = NULL;
@@ -347,7 +388,7 @@ static int s_sign(double x)
return 0;
}
-int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
+int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
{
int ret;
ret = s_sign(lhs->a - rhs->a); if (ret) return ret;
@@ -358,7 +399,7 @@ int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
}
-static point_t multiply_matrix_point(matrix_t m, point_t p)
+point_t extract_multiply_matrix_point(matrix_t m, point_t p)
{
double x = p.x;
p.x = m.a * x + m.c * p.y;
@@ -366,6 +407,18 @@ static point_t multiply_matrix_point(matrix_t m, point_t p)
return p;
}
+/* Combines <m1> and <m2> into a new matrix. The a, b, c and d members are the
+ordinary 2x2 product of (m1.a m1.b / m1.c m1.d) with (m2.a m2.b / m2.c m2.d).
+The e and f members are the plain sums m1.e + m2.e and m1.f + m2.f.
+
+NOTE(review): e and f are added, not transformed by the 2x2 part - confirm
+this is intended before relying on the translation of the result. */
+matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2)
+{
+    matrix_t product;
+
+    /* Translation terms. */
+    product.e = m1.e + m2.e;
+    product.f = m1.f + m2.f;
+
+    /* First row of the 2x2 part. */
+    product.a = m1.a * m2.a + m1.b * m2.c;
+    product.b = m1.a * m2.b + m1.b * m2.d;
+
+    /* Second row of the 2x2 part. */
+    product.c = m1.c * m2.a + m1.d * m2.c;
+    product.d = m1.c * m2.b + m1.d * m2.d;
+
+    return product;
+}
+
static int s_matrix_read(const char* text, matrix_t* matrix)
{
int n;
@@ -427,8 +480,8 @@ char_t into a new span_t. */
return 0;
}
- font_size = matrix_expansion(span->trm)
- * matrix_expansion(span->ctm);
+ font_size = extract_matrix_expansion(span->trm)
+ * extract_matrix_expansion(span->ctm);
if (span->flags.wmode) {
dir.x = 0;
@@ -438,7 +491,7 @@ char_t into a new span_t. */
dir.x = 1;
dir.y = 0;
}
- dir = multiply_matrix_point(span->trm, dir);
+ dir = extract_multiply_matrix_point(span->trm, dir);
x = char_[-2].pre_x + char_[-2].adv * dir.x;
y = char_[-2].pre_y + char_[-2].adv * dir.y;
@@ -470,10 +523,10 @@ char_t into a new span_t. */
sometimes seem to appear in the middle of words for some
reason. */
outfx("removing space before final char in: %s",
- span_string(span));
+ extract_span_string(span));
span->chars[span->chars_num-2] = span->chars[span->chars_num-1];
span->chars_num -= 1;
- outfx("span is now: %s", span_string(span));
+ outfx("span is now: %s", extract_span_string(span));
return 0;
}
}
@@ -536,9 +589,42 @@ struct extract_t
int contentss_num;
images_t images;
-
+
extract_format_t format;
extract_odt_styles_t odt_styles;
+
+ char* tables_csv_format;
+ int tables_csv_i;
+
+ enum
+ {
+ path_type_NONE,
+ path_type_FILL,
+ path_type_STROKE,
+ } path_type;
+
+ union
+ {
+ struct
+ {
+ matrix_t ctm;
+ double color;
+ point_t points[4];
+ int n;
+ } fill;
+
+ struct
+ {
+ matrix_t ctm;
+ double color;
+ double width;
+ point_t point0;
+ int point0_set;
+ point_t point;
+ int point_set;
+ } stroke;
+
+ } path;
};
@@ -551,7 +637,12 @@ int extract_begin(
int e = -1;
extract_t* extract;
- if (format != extract_format_ODT && format != extract_format_DOCX)
+ if (1
+ && format != extract_format_ODT
+ && format != extract_format_DOCX
+ && format != extract_format_HTML
+ && format != extract_format_TEXT
+ )
{
outf0("Invalid format=%i\n", format);
errno = EINVAL;
@@ -570,6 +661,8 @@ int extract_begin(
extract->image_n = 10;
extract->format = format;
+ extract->tables_csv_format = NULL;
+ extract->tables_csv_i = 0;
e = 0;
@@ -578,6 +671,11 @@ int extract_begin(
return e;
}
+/* Stores <path_format> in extract->tables_csv_format by passing it to
+extract_strdup(), and returns extract_strdup()'s result.
+
+NOTE(review): any value already held in extract->tables_csv_format is not
+freed here - confirm whether callers may call this more than once. */
+int extract_tables_csv_format(extract_t* extract, const char* path_format)
+{
+    extract_alloc_t* alloc = extract->alloc;
+    return extract_strdup(alloc, path_format, &extract->tables_csv_format);
+}
+
static void image_free_fn(void* handle, void* image_data)
{
@@ -872,6 +970,22 @@ int extract_span_begin(
span_t* span;
assert(extract->document.pages_num > 0);
page = extract->document.pages[extract->document.pages_num-1];
+ outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i",
+ ctm_a,
+ ctm_b,
+ ctm_c,
+ ctm_d,
+ ctm_e,
+ ctm_f,
+ trm_a,
+ trm_b,
+ trm_c,
+ trm_d,
+ trm_e,
+ trm_f,
+ font_name,
+ wmode
+ );
span = page_span_append(extract->alloc, page);
if (!span) goto end;
span->ctm.a = ctm_a;
@@ -880,12 +994,14 @@ int extract_span_begin(
span->ctm.d = ctm_d;
span->ctm.e = ctm_e;
span->ctm.f = ctm_f;
+
span->trm.a = trm_a;
span->trm.b = trm_b;
span->trm.c = trm_c;
span->trm.d = trm_d;
span->trm.e = trm_e;
span->trm.f = trm_f;
+
{
const char* ff = strchr(font_name, '+');
const char* f = (ff) ? ff+1 : font_name;
@@ -916,7 +1032,49 @@ int extract_add_char(
extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
span_t* span = page->spans[page->spans_num - 1];
- if (autosplit && y - extract->span_offset_y != 0) {
+ outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv);
+ /* Ignore the specified <autosplit> - there seems no advantage to not
+ splitting spans on multiple lines, and not doing so causes problems with
+ missing spaces in the output. */
+ autosplit = 1;
+
+ if (span->chars_num)
+ {
+ char_t* char_prev = &span->chars[span->chars_num - 1];
+ double xx = span->ctm.a * x + span->ctm.c * y + span->ctm.e;
+ double yy = span->ctm.b * x + span->ctm.d * y + span->ctm.f;
+ double dx = xx - char_prev->x;
+ double dy = yy - char_prev->y;
+ double a = atan2(dy, dx);
+ double span_a;
+ matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm);
+ point_t dir = {1 - span->flags.wmode, span->flags.wmode};
+ dir = extract_multiply_matrix_point(m, dir);
+ span_a = atan2(dir.y, dir.x);
+ if (fabs(span_a - a) > 0.01)
+ {
+ /* Create new span. */
+ span_t* span0 = span;
+ outf("chars_num=%i prev=(%f %f) => (%f %f) xy=(%f %f) => xxyy=(%f %f) delta=(%f %f) a=%f not in line with dir=(%f %f) a=%f: ",
+ span->chars_num,
+ char_prev->pre_x, char_prev->pre_y,
+ char_prev->x, char_prev->y,
+ x, y,
+ xx, yy,
+ dx, dy, a,
+ dir.x, dir.y, span_a
+ );
+ extract->num_spans_autosplit += 1;
+ span = page_span_append(extract->alloc, page);
+ if (!span) goto end;
+ *span = *span0;
+ span->chars = NULL;
+ span->chars_num = 0;
+ if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end;
+ }
+ }
+
+ if (0 && autosplit && y - extract->span_offset_y != 0) {
double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x)
+ span->ctm.b * (y - extract->span_offset_y);
@@ -949,21 +1107,20 @@ int extract_add_char(
char_pre_y, offset_y);
}
- if (span_append_c(extract->alloc, span, 0 /*c*/)) goto end;
+ if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end;
+ /* Coverity warns, but extract_span_append_c() will have appended an item. */
+ /* coverity[var_deref_op] */
char_ = &span->chars[ span->chars_num-1];
- char_->pre_x = x - extract->span_offset_x;
- char_->pre_y = y - extract->span_offset_y;
+ char_->pre_x = x;
+ char_->pre_y = y;
- char_->x = span->ctm.a * char_->pre_x + span->ctm.b * char_->pre_y;
- char_->y = span->ctm.c * char_->pre_x + span->ctm.d * char_->pre_y;
+ char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e;
+ char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f;
char_->adv = adv;
char_->ucs = ucs;
- char_->x += span->ctm.e;
- char_->y += span->ctm.f;
-
{
int page_spans_num_old = page->spans_num;
if (page_span_end_clean(extract->alloc, page)) goto end;
@@ -1049,6 +1206,174 @@ int extract_add_image(
return e;
}
+
+/* Appends one item, holding a copy of *rect and <color> (stored as a float),
+to the end of <tablelines>. The array is grown by one item with
+extract_realloc(). Returns 0 on success, or -1 if extract_realloc() fails. */
+static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, rect_t* rect, double color)
+{
+    size_t size_new = sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1);
+
+    if (extract_realloc(alloc, &tablelines->tablelines, size_new))
+    {
+        return -1;
+    }
+
+    /* Count the new item first, then fill it in as the last item. */
+    tablelines->tablelines_num += 1;
+    tablelines->tablelines[tablelines->tablelines_num - 1].rect = *rect;
+    tablelines->tablelines[tablelines->tablelines_num - 1].color = (float) color;
+    return 0;
+}
+
+/* Returns the point (ctm_a*x + ctm_b*y + ctm_e, ctm_c*x + ctm_d*y + ctm_f).
+
+NOTE(review): extract_add_char() pairs the coefficients differently
+(a*x + c*y + e, b*x + d*y + f) - confirm which convention is intended here;
+the two agree only when ctm_b == ctm_c. */
+static point_t transform(double x, double y,
+        double ctm_a,
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f
+        )
+{
+    point_t transformed;
+    double tx = ctm_a * x + ctm_b * y + ctm_e;
+    double ty = ctm_c * x + ctm_d * y + ctm_f;
+
+    transformed.x = tx;
+    transformed.y = ty;
+    return transformed;
+}
+
+/* Returns <a> if it is strictly less than <b>, otherwise <b>. */
+static double s_min(double a, double b)
+{
+    if (a < b)
+    {
+        return a;
+    }
+    return b;
+}
+
+/* Returns <a> if it is strictly greater than <b>, otherwise <b>. */
+static double s_max(double a, double b)
+{
+    if (a > b)
+    {
+        return a;
+    }
+    return b;
+}
+
+/* Handles a filled path made of exactly four points.
+
+The four points are transformed with transform() using the ctm_* values. If
+the transformed points form an axis-aligned rectangle (coordinates are
+compared exactly), the rectangle is appended, together with <color>, to the
+last page's horizontal tablelines when width/height > 5, or to its vertical
+tablelines when height/width > 5. Any other path is ignored.
+
+Returns 0 on success (including when the path is ignored), or -1 if
+tablelines_append() fails. */
+int extract_add_path4(
+        extract_t* extract,
+        double ctm_a,
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double x0,
+        double y0,
+        double x1,
+        double y1,
+        double x2,
+        double y2,
+        double x3,
+        double y3,
+        double color
+        )
+{
+    extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    point_t points[4] = {
+        transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+        transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+        transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+        transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f)
+    };
+    rect_t rect;
+    int i;
+    double y_a;
+    double y_b;
+    double dx;
+    double dy;
+
+    outf("ctm=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]",
+            ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f,
+            x0, y0, x1, y1, x2, y2, x3, y3
+            );
+    outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]",
+            x0, y0, x1, y1, x2, y2, x3, y3);
+
+    /* Find first step with dx > 0; it runs from min.x to max.x of the
+    candidate rectangle. */
+    for (i=0; i<4; ++i)
+    {
+        if (points[(i+1) % 4].x > points[i].x) break;
+    }
+    outf("i=%i", i);
+    if (i == 4) return 0;
+
+    /* The two points after that step must sit at max.x and then min.x. */
+    rect.min.x = points[i].x;
+    rect.max.x = points[(i+1) % 4].x;
+    if (points[(i+2) % 4].x != rect.max.x) return 0;
+    if (points[(i+3) % 4].x != rect.min.x) return 0;
+
+    /* The first step must be level at y_a, the opposite side level at a
+    different y_b. */
+    y_a = points[(i+1) % 4].y;
+    y_b = points[(i+2) % 4].y;
+    if (y_a == y_b) return 0;
+    if (points[(i+3) % 4].y != y_b) return 0;
+    if (points[i].y != y_a) return 0;
+    rect.min.y = (y_b > y_a) ? y_a : y_b;
+    rect.max.y = (y_b > y_a) ? y_b : y_a;
+
+    /* Both dx and dy are non-zero here, because max.x > min.x and
+    y_a != y_b were established above. */
+    dx = rect.max.x - rect.min.x;
+    dy = rect.max.y - rect.min.y;
+    if (dx / dy > 5)
+    {
+        /* Horizontal line. */
+        outf("have found horizontal line: %s", extract_rect_string(&rect));
+        if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1;
+    }
+    else if (dy / dx > 5)
+    {
+        /* Vertical line. */
+        outf("have found vertical line: %s", extract_rect_string(&rect));
+        if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1;
+    }
+    return 0;
+}
+
+
+/* Handles a stroked line from (x0 y0) to (x1 y1).
+
+Both end points are transformed with transform() using the ctm_* values. If
+the transformed line is exactly vertical or exactly horizontal, it is widened
+by half of the scaled line width on each side and appended, together with
+<color>, to the last page's vertical or horizontal tablelines. Zero-length
+and diagonal lines are ignored.
+
+Returns 0 if the line is ignored, otherwise the result of
+tablelines_append(). */
+int extract_add_line(
+        extract_t* extract,
+        double ctm_a,
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double width,
+        double x0,
+        double y0,
+        double x1,
+        double y1,
+        double color
+        )
+{
+    extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
+    point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
+    /* Line width scaled by the square root of |determinant| of the ctm. */
+    double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c));
+    double half = width2 / 2;
+    rect_t rect;
+    int same_x;
+    int same_y;
+
+    rect.min.x = s_min(p0.x, p1.x);
+    rect.max.x = s_max(p0.x, p1.x);
+    rect.min.y = s_min(p0.y, p1.y);
+    rect.max.y = s_max(p0.y, p1.y);
+
+    outf("%s: width=%f ((%f %f)(%f %f)) rect=%s",
+            extract_FUNCTION,
+            width,
+            x0, y0, x1, y1,
+            extract_rect_string(&rect)
+            );
+
+    same_x = (rect.min.x == rect.max.x);
+    same_y = (rect.min.y == rect.max.y);
+
+    /* Zero-length line - nothing to record. */
+    if (same_x && same_y) return 0;
+
+    if (same_x)
+    {
+        /* Vertical line. */
+        rect.min.x -= half;
+        rect.max.x += half;
+        return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color);
+    }
+    if (same_y)
+    {
+        /* Horizontal line. */
+        rect.min.y -= half;
+        rect.max.y += half;
+        return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color);
+    }
+
+    /* Diagonal line - ignored. */
+    return 0;
+}
+
+
int extract_page_begin(extract_t* extract)
{
/* Appends new empty extract_page_t to an extract->document. */
@@ -1062,6 +1387,13 @@ int extract_page_begin(extract_t* extract)
page->paragraphs_num = 0;
page->images = NULL;
page->images_num = 0;
+ page->tablelines_horizontal.tablelines = NULL;
+ page->tablelines_horizontal.tablelines_num = 0;
+ page->tablelines_vertical.tablelines = NULL;
+ page->tablelines_vertical.tablelines_num = 0;
+ page->tables = NULL;
+ page->tables_num = 0;
+
if (extract_realloc2(
extract->alloc,
&extract->document.pages,
@@ -1076,6 +1408,231 @@ int extract_page_begin(extract_t* extract)
return 0;
}
+int extract_fill_begin(
+        extract_t*  extract,
+        double      ctm_a,
+        double      ctm_b,
+        double      ctm_c,
+        double      ctm_d,
+        double      ctm_e,
+        double      ctm_f,
+        double      color
+        )
+/* Starts a fill path. Must not be called while another path is active.
+Stores the ctm and colour and resets the number of collected points to zero.
+Always returns 0. */
+{
+    assert(extract->path_type == path_type_NONE);
+
+    /* Transform that will be applied to the path's points. */
+    extract->path.fill.ctm.a = ctm_a;
+    extract->path.fill.ctm.b = ctm_b;
+    extract->path.fill.ctm.c = ctm_c;
+    extract->path.fill.ctm.d = ctm_d;
+    extract->path.fill.ctm.e = ctm_e;
+    extract->path.fill.ctm.f = ctm_f;
+
+    /* No points yet. */
+    extract->path.fill.n = 0;
+    extract->path.fill.color = color;
+
+    extract->path_type = path_type_FILL;
+    return 0;
+}
+
+int extract_stroke_begin(
+        extract_t*  extract,
+        double      ctm_a,
+        double      ctm_b,
+        double      ctm_c,
+        double      ctm_d,
+        double      ctm_e,
+        double      ctm_f,
+        double      line_width,
+        double      color
+        )
+/* Starts a stroke path. Must not be called while another path is active.
+Stores the ctm, line width and colour, and marks both the subpath start point
+and the current point as unset. Always returns 0. */
+{
+    assert(extract->path_type == path_type_NONE);
+
+    /* Neither the start point nor the current point is known yet. */
+    extract->path.stroke.point_set = 0;
+    extract->path.stroke.point0_set = 0;
+
+    extract->path.stroke.color = color;
+    extract->path.stroke.width = line_width;
+
+    extract->path.stroke.ctm.a = ctm_a;
+    extract->path.stroke.ctm.b = ctm_b;
+    extract->path.stroke.ctm.c = ctm_c;
+    extract->path.stroke.ctm.d = ctm_d;
+    extract->path.stroke.ctm.e = ctm_e;
+    extract->path.stroke.ctm.f = ctm_f;
+
+    extract->path_type = path_type_STROKE;
+    return 0;
+}
+
+int extract_moveto(extract_t* extract, double x, double y)
+/* Handles a moveto within the current fill or stroke path.
+
+Fill: only accepted as the very first point; a moveto after points have been
+collected marks the path as unusable (n = -1) so later points are ignored.
+
+Stroke: sets the current point, and also the subpath start point if that has
+not been set yet.
+
+Returns 0, or -1 if no path is active. */
+{
+    switch (extract->path_type)
+    {
+        case path_type_FILL:
+        {
+            int n = extract->path.fill.n;
+            if (n == -1)
+            {
+                /* Path already marked as unusable. */
+                return 0;
+            }
+            if (n != 0)
+            {
+                outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
+                extract->path.fill.n = -1;
+                return 0;
+            }
+            /* n is zero here, so this is the first point. */
+            extract->path.fill.points[0].x = x;
+            extract->path.fill.points[0].y = y;
+            extract->path.fill.n = 1;
+            return 0;
+        }
+        case path_type_STROKE:
+        {
+            extract->path.stroke.point.x = x;
+            extract->path.stroke.point.y = y;
+            extract->path.stroke.point_set = 1;
+            if (!extract->path.stroke.point0_set)
+            {
+                extract->path.stroke.point0 = extract->path.stroke.point;
+                extract->path.stroke.point0_set = 1;
+            }
+            return 0;
+        }
+        default:
+        {
+            assert(0);
+            return -1;
+        }
+    }
+}
+
+int extract_lineto(extract_t* extract, double x, double y)
+/* Handles a lineto within the current fill or stroke path.
+
+Fill: appends the point if the path has between one and three points so far;
+otherwise marks the path as unusable (n = -1).
+
+Stroke: if there is a current point, passes the segment from it to (x,y) to
+extract_add_line(); then makes (x,y) the current point (and the subpath start
+point if that is not yet set).
+
+Returns 0, or -1 if no path is active or extract_add_line() fails. */
+{
+    if (extract->path_type != path_type_FILL
+            && extract->path_type != path_type_STROKE
+            )
+    {
+        assert(0);
+        return -1;
+    }
+
+    if (extract->path_type == path_type_FILL)
+    {
+        int n = extract->path.fill.n;
+        if (n == -1)
+        {
+            /* Path already marked as unusable. */
+            return 0;
+        }
+        if (!(n != 0 && n < 4))
+        {
+            /* Either no initial moveto, or more than four points. */
+            outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
+            extract->path.fill.n = -1;
+            return 0;
+        }
+        extract->path.fill.points[n].x = x;
+        extract->path.fill.points[n].y = y;
+        extract->path.fill.n = n + 1;
+        return 0;
+    }
+
+    /* Stroke. */
+    if (extract->path.stroke.point_set
+            && extract_add_line(
+                    extract,
+                    extract->path.stroke.ctm.a,
+                    extract->path.stroke.ctm.b,
+                    extract->path.stroke.ctm.c,
+                    extract->path.stroke.ctm.d,
+                    extract->path.stroke.ctm.e,
+                    extract->path.stroke.ctm.f,
+                    extract->path.stroke.width,
+                    extract->path.stroke.point.x,
+                    extract->path.stroke.point.y,
+                    x,
+                    y,
+                    extract->path.stroke.color
+                    )
+            )
+    {
+        return -1;
+    }
+    extract->path.stroke.point.x = x;
+    extract->path.stroke.point.y = y;
+    extract->path.stroke.point_set = 1;
+    if (!extract->path.stroke.point0_set)
+    {
+        extract->path.stroke.point0 = extract->path.stroke.point;
+        extract->path.stroke.point0_set = 1;
+    }
+    return 0;
+}
+
+int extract_closepath(extract_t* extract)
+/* Handles a closepath within the current fill or stroke path.
+
+Fill: if exactly four points were collected, passes them to
+extract_add_path4() (the path may be a thin rectangle that forms a table
+line). The point count is then reset to zero so a new subpath can start.
+
+Stroke: if the subpath has a start point and a current point, passes the
+closing segment from the current point to the start point to
+extract_add_line(). The current point then becomes the start point, so that a
+following lineto starts from there.
+
+Returns 0 on success, the error from extract_add_path4(), or -1 if
+extract_add_line() fails or no path is active. */
+{
+    if (extract->path_type == path_type_FILL)
+    {
+        if (extract->path.fill.n == 4)
+        {
+            /* We are closing a four-element path, so this could be a thin
+            rectangle that defines a line in a table. */
+            int e;
+            e = extract_add_path4(
+                    extract,
+                    extract->path.fill.ctm.a,
+                    extract->path.fill.ctm.b,
+                    extract->path.fill.ctm.c,
+                    extract->path.fill.ctm.d,
+                    extract->path.fill.ctm.e,
+                    extract->path.fill.ctm.f,
+                    extract->path.fill.points[0].x,
+                    extract->path.fill.points[0].y,
+                    extract->path.fill.points[1].x,
+                    extract->path.fill.points[1].y,
+                    extract->path.fill.points[2].x,
+                    extract->path.fill.points[2].y,
+                    extract->path.fill.points[3].x,
+                    extract->path.fill.points[3].y,
+                    extract->path.fill.color
+                    );
+            if (e) return e;
+        }
+        /* Also clears the 'unusable' marker (-1), ready for a new subpath. */
+        extract->path.fill.n = 0;
+        return 0;
+    }
+    else if (extract->path_type == path_type_STROKE)
+    {
+        if (extract->path.stroke.point0_set && extract->path.stroke.point_set)
+        {
+            if (extract_add_line(
+                    extract,
+                    extract->path.stroke.ctm.a,
+                    extract->path.stroke.ctm.b,
+                    extract->path.stroke.ctm.c,
+                    extract->path.stroke.ctm.d,
+                    extract->path.stroke.ctm.e,
+                    extract->path.stroke.ctm.f,
+                    extract->path.stroke.width,
+                    extract->path.stroke.point.x,
+                    extract->path.stroke.point.y,
+                    extract->path.stroke.point0.x,
+                    extract->path.stroke.point0.y,
+                    extract->path.stroke.color
+                    ))
+            {
+                return -1;
+            }
+        }
+        if (extract->path.stroke.point0_set)
+        {
+            /* Closing a subpath moves the current point back to its start.
+            Previously this was skipped whenever the closing segment had been
+            added, leaving the current point at the old position. */
+            extract->path.stroke.point = extract->path.stroke.point0;
+        }
+        return 0;
+    }
+    else
+    {
+        assert(0);
+        return -1;
+    }
+}
+
+
+int extract_fill_end(extract_t* extract)
+/* Ends the fill path started by extract_fill_begin(); afterwards no path is
+active. Always returns 0. */
+{
+    assert(extract->path_type == path_type_FILL);
+
+    extract->path_type = path_type_NONE;
+
+    return 0;
+}
+
+
+int extract_stroke_end(extract_t* extract)
+/* Ends the stroke path started by extract_stroke_begin(); afterwards no path
+is active. Always returns 0. */
+{
+    assert(extract->path_type == path_type_STROKE);
+
+    extract->path_type = path_type_NONE;
+
+    return 0;
+}
+
+
int extract_page_end(extract_t* extract)
{
@@ -1083,6 +1640,118 @@ int extract_page_end(extract_t* extract)
return 0;
}
+
+static int paragraphs_to_text_content(
+        extract_alloc_t*    alloc,
+        paragraph_t**       paragraphs,
+        int                 paragraphs_num,
+        extract_astring_t*  text
+        )
+/* Appends plain text for paragraphs[0..paragraphs_num-1] to <text>. Every
+character of every span of every line is appended via
+extract_astring_catc_unicode(), and a newline is appended after each
+paragraph. Returns 0 on success or -1 if appending fails. */
+{
+    int pi;
+    for (pi = 0; pi < paragraphs_num; ++pi)
+    {
+        paragraph_t* para = paragraphs[pi];
+        int li;
+        for (li = 0; li < para->lines_num; ++li)
+        {
+            line_t* ln = para->lines[li];
+            int si;
+            for (si = 0; si < ln->spans_num; ++si)
+            {
+                span_t* sp = ln->spans[si];
+                int ci;
+                for (ci = 0; ci < sp->chars_num; ++ci)
+                {
+                    /* Each character is appended as utf8. */
+                    unsigned ucs = sp->chars[ci].ucs;
+                    int e = extract_astring_catc_unicode(
+                            alloc,
+                            text,
+                            ucs,
+                            0 /*xml*/,
+                            1 /*ascii_ligatures*/,
+                            1 /*ascii_dash*/,
+                            1 /*ascii_apostrophe*/
+                            );
+                    if (e) return -1;
+                }
+            }
+        }
+        /* Paragraphs are separated by newlines. */
+        if (extract_astring_catc(alloc, text, '\n')) return -1;
+    }
+    return 0;
+}
+
+
+static int s_csv_write_field(FILE* f, const char* s)
+/* Writes <s> to <f> as a double-quoted CSV field, doubling any embedded
+double-quote characters so the field stays well-formed. <s> may be NULL, which
+is written as an empty field. Returns 0 on success or -1 if a write fails. */
+{
+    if (fputc('"', f) == EOF) return -1;
+    if (s)
+    {
+        for (; *s; ++s)
+        {
+            if (*s == '"' && fputc('"', f) == EOF) return -1;
+            if (fputc(*s, f) == EOF) return -1;
+        }
+    }
+    if (fputc('"', f) == EOF) return -1;
+    return 0;
+}
+
+
+static int extract_write_tables_csv(extract_t* extract)
+/* Writes each table of each page in extract->document to its own CSV file.
+
+Each file's path is made by formatting extract->tables_csv_format with
+extract->tables_csv_i, which is incremented for every table. Each cell's text
+comes from paragraphs_to_text_content() and is written as a quoted field.
+
+Does nothing and returns 0 if extract->tables_csv_format is NULL. Otherwise
+returns 0 on success, or -1 if formatting the path, opening the file, building
+cell text or writing fails. */
+{
+    int ret = -1;
+    int p;
+    char* path = NULL;
+    FILE* f = NULL;
+    extract_astring_t text = {NULL, 0};
+    if (!extract->tables_csv_format) return 0;
+
+    outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format);
+    outf("extract->document.pages_num=%i", extract->document.pages_num);
+    for (p=0; p<extract->document.pages_num; ++p)
+    {
+        extract_page_t* page = extract->document.pages[p];
+        int t;
+        outf("p=%i page->tables_num=%i", p, page->tables_num);
+        for (t=0; t<page->tables_num; ++t)
+        {
+            table_t* table = page->tables[t];
+            int y;
+            extract_free(extract->alloc, &path);
+            /* NOTE(review): tables_csv_format is used directly as a printf
+            format; it must contain exactly one integer conversion. */
+            if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end;
+            extract->tables_csv_i += 1;
+            outf("Writing table %i to: %s", t, path);
+            outf("table->cells_num_x=%i", table->cells_num_x);
+            outf("table->cells_num_y=%i", table->cells_num_y);
+            f = fopen(path, "w");
+            if (!f) goto end;
+            for (y=0; y<table->cells_num_y; ++y)
+            {
+                int x;
+                for (x=0; x<table->cells_num_x; ++x)
+                {
+                    cell_t* cell = table->cells[table->cells_num_x * y + x];
+                    extract_astring_free(extract->alloc, &text);
+                    if (y==0)
+                    {
+                        outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect));
+                    }
+                    /* Separator before every field except the first. */
+                    if (x > 0 && fputc(',', f) == EOF) goto end;
+                    if (paragraphs_to_text_content(
+                            extract->alloc,
+                            cell->paragraphs,
+                            cell->paragraphs_num,
+                            &text
+                            )) goto end;
+                    /* Reference csv output trims trailing spaces. */
+                    extract_astring_char_truncate_if(&text, ' ');
+                    if (s_csv_write_field(f, text.chars)) goto end;
+                }
+                if (fputc('\n', f) == EOF) goto end;
+            }
+            {
+                /* Buffered write errors may only show up here. */
+                int e = ferror(f);
+                if (fclose(f)) e = 1;
+                f = NULL;
+                if (e) goto end;
+            }
+        }
+    }
+    ret = 0;
+
+    end:
+    if (f) fclose(f);
+    extract_free(extract->alloc, &path);
+    extract_astring_free(extract->alloc, &text);
+    return ret;
+}
+
+
int extract_process(
extract_t* extract,
int spacing,
@@ -1126,6 +1795,30 @@ int extract_process(
&extract->contentss[extract->contentss_num - 1]
)) goto end;
}
+ else if (extract->format == extract_format_HTML)
+ {
+ if (extract_document_to_html_content(
+ extract->alloc,
+ &extract->document,
+ rotation,
+ images,
+ &extract->contentss[extract->contentss_num - 1]
+ )) goto end;
+ }
+ else if (extract->format == extract_format_TEXT)
+ {
+ int p;
+ for (p=0; p<extract->document.pages_num; ++p)
+ {
+ extract_page_t* page = extract->document.pages[p];
+ if (paragraphs_to_text_content(
+ extract->alloc,
+ page->paragraphs,
+ page->paragraphs_num,
+ &extract->contentss[extract->contentss_num - 1]
+ )) goto end;
+ }
+ }
else
{
outf0("Invalid format=%i", extract->format);
@@ -1136,11 +1829,15 @@ int extract_process(
if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end;
+ if (extract->tables_csv_format)
+ {
+ extract_write_tables_csv(extract);
+ }
+
{
int i;
for (i=0; i<extract->document.pages_num; ++i) {
- page_free(extract->alloc, extract->document.pages[i]);
- extract_free(extract->alloc, &extract->document.pages[i]);
+ page_free(extract->alloc, &extract->document.pages[i]);
}
extract_free(extract->alloc, &extract->document.pages);
extract->document.pages_num = 0;
@@ -1159,9 +1856,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
char* text2 = NULL;
int i;
- if (extract_zip_open(buffer, &zip)) goto end;
if (extract->format == extract_format_ODT)
{
+ if (extract_zip_open(buffer, &zip)) goto end;
for (i=0; i<odt_template_items_num; ++i) {
const odt_template_item_t* item = &odt_template_items[i];
extract_free(extract->alloc, &text2);
@@ -1191,9 +1888,11 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end;
if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
}
+ if (extract_zip_close(&zip)) goto end;
}
else if (extract->format == extract_format_DOCX)
{
+ if (extract_zip_open(buffer, &zip)) goto end;
for (i=0; i<docx_template_items_num; ++i) {
const docx_template_item_t* item = &docx_template_items[i];
extract_free(extract->alloc, &text2);
@@ -1222,6 +1921,22 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end;
if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
}
+ if (extract_zip_close(&zip)) goto end;
+
+ }
+ else if (extract->format == extract_format_HTML)
+ {
+ for (i=0; i<extract->contentss_num; ++i)
+ {
+ if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
+ }
+ }
+ else if (extract->format == extract_format_TEXT)
+ {
+ for (i=0; i<extract->contentss_num; ++i)
+ {
+ if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
+ }
}
else
{
@@ -1231,15 +1946,15 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
return 1;
}
- if (extract_zip_close(&zip)) goto end;
- assert(!zip);
-
e = 0;
end:
- if (e) outf("failed: %s", strerror(errno));
+ if (e)
+ {
+ outf("failed: %s", strerror(errno));
+ extract_zip_close(&zip);
+ }
extract_free(extract->alloc, &text2);
- extract_zip_close(&zip);
return e;
}
@@ -1300,6 +2015,7 @@ int extract_write_template(
}
}
+
void extract_end(extract_t** pextract)
{
extract_t* extract = *pextract;
@@ -1314,12 +2030,13 @@ void extract_end(extract_t** pextract)
extract_free(extract->alloc, &extract->contentss);
}
extract_images_free(extract->alloc, &extract->images);
+ extract_odt_styles_free(extract->alloc, &extract->odt_styles);
extract_free(extract->alloc, pextract);
}
void extract_internal_end(void)
{
- span_string(NULL, NULL);
+ extract_span_string(NULL, NULL);
}
void extract_exp_min(extract_t* extract, size_t size)
@@ -1329,8 +2046,8 @@ void extract_exp_min(extract_t* extract, size_t size)
double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm)
{
- double font_size = matrix_expansion(*trm)
- * matrix_expansion(*ctm);
+ double font_size = extract_matrix_expansion(*trm)
+ * extract_matrix_expansion(*ctm);
/* Round font_size to nearest 0.01. */
font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f;
return font_size;
diff --git a/extract/src/html.c b/extract/src/html.c
new file mode 100644
index 00000000..d12a3101
--- /dev/null
+++ b/extract/src/html.c
@@ -0,0 +1,314 @@
+/* These functions generate html content from a document_t; paragraphs are
+written as <p> elements and tables as <table> elements.
+
+The only function intended for use outside this file is
+extract_document_to_html_content(). */
+
+#include "../include/extract.h"
+
+#include "astring.h"
+#include "document.h"
+#include "html.h"
+#include "mem.h"
+#include "memento.h"
+#include "outf.h"
+#include "sys.h"
+#include "text.h"
+#include "zip.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sys/stat.h>
+
+
+static void content_state_init(content_state_t* content_state)
+/* Sets *content_state to its initial state: no previous ctm, no font name,
+zero font size, and bold and italic both off. */
+{
+    content_state->ctm_prev = NULL;
+
+    content_state->font.italic = 0;
+    content_state->font.bold = 0;
+    content_state->font.size = 0;
+    content_state->font.name = NULL;
+}
+
+static int content_state_reset(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content)
+/* Closes any bold and/or italic element that *content_state says is open, by
+appending "</b>" and then "</i>" to <content>, and clears the corresponding
+flags. Returns 0 on success or -1 if appending fails. */
+{
+    if (content_state->font.bold)
+    {
+        if (extract_astring_cat(alloc, content, "</b>")) return -1;
+        content_state->font.bold = 0;
+    }
+
+    if (content_state->font.italic)
+    {
+        if (extract_astring_cat(alloc, content, "</i>")) return -1;
+        content_state->font.italic = 0;
+    }
+
+    return 0;
+}
+
+static int paragraph_to_html_content(
+        extract_alloc_t*    alloc,
+        content_state_t*    content_state,
+        paragraph_t*        paragraph,
+        int                 single_line,
+        extract_astring_t*  content
+        )
+/* Appends html for <paragraph> to <content> as a <p>...</p> element.
+
+If <single_line> is zero, newlines are written before <p> and before </p>;
+otherwise the whole element is written without newlines.
+
+<b>/</b> and <i>/</i> are written whenever a span's bold/italic flag differs
+from *content_state, which is updated to match. Elements left open at the end
+of the paragraph are not closed here; see content_state_reset().
+
+Returns 0 on success or -1 if appending fails. */
+{
+    int e = -1;
+    const char* endl = (single_line) ? "" : "\n";
+    int l;
+    if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end;
+
+    for (l=0; l<paragraph->lines_num; ++l)
+    {
+        line_t* line = paragraph->lines[l];
+        int s;
+        for (s=0; s<line->spans_num; ++s)
+        {
+            int c;
+            span_t* span = line->spans[s];
+            content_state->ctm_prev = &span->ctm;
+            if (span->flags.font_bold != content_state->font.bold)
+            {
+                if (extract_astring_cat(alloc, content,
+                        span->flags.font_bold ? "<b>" : "</b>"
+                        )) goto end;
+                content_state->font.bold = span->flags.font_bold;
+            }
+            if (span->flags.font_italic != content_state->font.italic)
+            {
+                if ( extract_astring_cat(alloc, content,
+                        span->flags.font_italic ? "<i>" : "</i>"
+                        )) goto end;
+                content_state->font.italic = span->flags.font_italic;
+            }
+
+            for (c=0; c<span->chars_num; ++c)
+            {
+                char_t* char_ = &span->chars[c];
+                if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end;
+            }
+        }
+
+        /* Join this line to the next one: a trailing '-' is dropped so that a
+        hyphenated word is rejoined, otherwise we make sure the lines are
+        separated by a space. */
+        if (content->chars_num && l+1 < paragraph->lines_num)
+        {
+            if (content->chars[content->chars_num-1] == '-')    content->chars_num -= 1;
+            else if (content->chars[content->chars_num-1] != ' ')
+            {
+                /* Previously the result of this call was ignored. */
+                if (extract_astring_catc(alloc, content, ' ')) goto end;
+            }
+        }
+    }
+    if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end;
+
+    e = 0;
+
+    end:
+    return e;
+}
+
+
+static int paragraphs_to_html_content(
+        extract_alloc_t*    alloc,
+        content_state_t*    state,
+        paragraph_t**       paragraphs,
+        int                 paragraphs_num,
+        int                 single_line,
+        extract_astring_t*  content
+        )
+/* Appends html for each of paragraphs[0..paragraphs_num-1] to <content>, then
+closes any bold/italic element still open. *state is updated as the font
+changes. Returns 0 on success or -1 on failure. */
+{
+    int i = 0;
+
+    while (i < paragraphs_num)
+    {
+        if (paragraph_to_html_content(alloc, state, paragraphs[i], single_line, content))
+        {
+            return -1;
+        }
+        i += 1;
+    }
+
+    return content_state_reset(alloc, state, content) ? -1 : 0;
+}
+
+static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* table, extract_astring_t* content)
+/* Appends html for <table> to <content> as a bordered <table> element, with
+one <tr> per row of cells. Cells for which cell->above or cell->left is zero
+are skipped. A cell gets a colspan/rowspan attribute when its extend_right /
+extend_down is greater than one. Returns 0 on success or -1 if appending
+fails. */
+{
+    /* If 1, we put each <td>...</td> on a separate line. */
+    const int multiline = 0;
+    int row;
+
+    if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) return -1;
+
+    for (row = 0; row < table->cells_num_y; ++row)
+    {
+        int col;
+
+        if (extract_astring_cat(alloc, content, " <tr>\n")) return -1;
+        if (!multiline && extract_astring_cat(alloc, content, " ")) return -1;
+
+        for (col = 0; col < table->cells_num_x; ++col)
+        {
+            cell_t* cell = table->cells[row*table->cells_num_x + col];
+
+            /* HTML does not require anything for cells that are subsumed by
+            other cells that extend horizontally and vertically. */
+            if (!cell->above || !cell->left) continue;
+
+            if (extract_astring_cat(alloc, content, " ")) return -1;
+            if (extract_astring_cat(alloc, content, "<td")) return -1;
+            if (cell->extend_right > 1
+                    && extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)
+                    )
+            {
+                return -1;
+            }
+            if (cell->extend_down > 1
+                    && extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)
+                    )
+            {
+                return -1;
+            }
+            if (extract_astring_cat(alloc, content, ">")) return -1;
+
+            /* Cell contents, followed by the closing tag. */
+            if (paragraphs_to_html_content(alloc, state, cell->paragraphs, cell->paragraphs_num, 1 /* single_line*/, content)) return -1;
+            if (extract_astring_cat(alloc, content, "</td>")) return -1;
+            if (extract_astring_cat(alloc, content, "\n")) return -1;
+
+            if (content_state_reset(alloc, state, content)) return -1;
+        }
+
+        if (!multiline && extract_astring_cat(alloc, content, "\n")) return -1;
+        if (extract_astring_cat(alloc, content, " </tr>\n")) return -1;
+    }
+
+    if (extract_astring_cat(alloc, content, "</table>\n\n")) return -1;
+    return 0;
+}
+
+
+static char_t* paragraph_first_char(const paragraph_t* paragraph)
+/* Returns the first char_t of the first span of the first line of
+<paragraph>. The paragraph must contain at least one line, whose first span
+must contain at least one character.
+
+Previously this looked at the last span of the last line, which did not match
+the function's name or the key used when paragraphs are merged with tables
+(paragraph->lines[0]->spans[0]->chars[0].y). */
+{
+    line_t* line = paragraph->lines[0];
+    span_t* span = line->spans[0];
+    return &span->chars[0];
+}
+
+static int compare_paragraph_y(const void* a, const void* b)
+/* qsort() comparison function for an array of paragraph_t*. Orders by
+increasing y of the char_t returned by paragraph_first_char(). */
+{
+    const paragraph_t* const* pa = a;
+    const paragraph_t* const* pb = b;
+    double ya = paragraph_first_char(*pa)->y;
+    double yb = paragraph_first_char(*pb)->y;
+
+    /* +1 if ya > yb, -1 if ya < yb, otherwise 0. */
+    return (ya > yb) - (ya < yb);
+}
+
+int extract_document_to_html_content(
+        extract_alloc_t*    alloc,
+        document_t*         document,
+        int                 rotation,
+        int                 images,
+        extract_astring_t*  content
+        )
+/* Appends html for all pages of <document> to <content>, wrapped in
+<html><body> ... </body></html>.
+
+For each page, paragraphs are sorted with compare_paragraph_y() and then
+merged with page->tables[] so that items are written in order of increasing y.
+
+<rotation> and <images> are currently ignored.
+
+Returns 0 on success or -1 on failure; on failure <content> may contain
+partial output. */
+{
+    int ret = -1;
+    int p;
+    paragraph_t** paragraphs = NULL;
+
+    (void) rotation;
+    (void) images;
+
+    if (extract_astring_cat(alloc, content, "<html>\n")) goto end;
+    if (extract_astring_cat(alloc, content, "<body>\n")) goto end;
+
+    /* Write paragraphs into <content>. */
+    for (p=0; p<document->pages_num; ++p)
+    {
+        extract_page_t* page = document->pages[p];
+        int i;  /* Index into paragraphs[]. */
+        int t;  /* Index into page->tables[]. */
+        content_state_t state;
+        content_state_init(&state);
+        extract_free(alloc, &paragraphs);
+
+        /* Output paragraphs and tables in order of increasing <y> coordinate.
+
+        Unfortunately the paragraph ordering we do in page->paragraphs[]
+        isn't quite right and results in bad ordering if ctm/trm matrices are
+        inconsistent. So we create our own list of paragraphs sorted strictly
+        by y coordinate of the first char of each paragraph. */
+        if (page->paragraphs_num > 0)
+        {
+            /* Skipped for empty pages so that we never request a zero-sized
+            allocation or sort an empty array. */
+            if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * page->paragraphs_num)) goto end;
+            for (i = 0; i < page->paragraphs_num; ++i)
+            {
+                paragraphs[i] = page->paragraphs[i];
+            }
+            qsort(paragraphs, page->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);
+        }
+
+        if (0)
+        {
+            /* Disabled diagnostics. */
+            int j;
+            outf0("paragraphs are:");
+            for (j=0; j<page->paragraphs_num; ++j)
+            {
+                paragraph_t* paragraph = page->paragraphs[j];
+                line_t* line = paragraph->lines[0];
+                span_t* span = line->spans[0];
+                outf0(" p=%i: %s", j, extract_span_string(NULL, span));
+            }
+        }
+
+        i = 0;
+        t = 0;
+        for(;;)
+        {
+            double y_paragraph;
+            double y_table;
+            paragraph_t* paragraph = (i == page->paragraphs_num) ? NULL : paragraphs[i];
+            table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+            if (!paragraph && !table)   break;
+            y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+            y_table = (table) ? table->pos.y : DBL_MAX;
+            outf("p=%i y_paragraph=%f", i, y_paragraph);
+            outf("t=%i y_table=%f", t, y_table);
+            /* The explicit !table test guarantees progress when there are no
+            tables left, even if y_paragraph is NaN or DBL_MAX. */
+            if (paragraph && (!table || y_paragraph < y_table))
+            {
+                if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, content)) goto end;
+                if (content_state_reset(alloc, &state, content)) goto end;
+                i += 1;
+            }
+            else
+            {
+                /* <table> is non-NULL here. */
+                if (append_table(alloc, &state, table, content)) goto end;
+                t += 1;
+            }
+        }
+    }
+    if (extract_astring_cat(alloc, content, "</body>\n")) goto end;
+    if (extract_astring_cat(alloc, content, "</html>\n")) goto end;
+    ret = 0;
+
+    end:
+    extract_free(alloc, &paragraphs);
+    return ret;
+}
diff --git a/extract/src/html.h b/extract/src/html.h
new file mode 100644
index 00000000..6148a067
--- /dev/null
+++ b/extract/src/html.h
@@ -0,0 +1,23 @@
+#ifndef ARTIFEX_EXTRACT_HTML_H
+#define ARTIFEX_EXTRACT_HTML_H
+
+/* Only for internal use by extract code. */
+
+/* Things for creating html content. */
+
+int extract_document_to_html_content(
+ extract_alloc_t* alloc,
+ document_t* document,
+ int rotation,
+ int images,
+ extract_astring_t* content
+ );
+/* Appends html for all paragraphs and tables in *document to *content,
+wrapped in <html><body>...</body></html>.
+
+<rotation> and <images> are currently ignored.
+
+Returns 0 on success or -1 on failure. */
+
+
+#endif
diff --git a/extract/src/join.c b/extract/src/join.c
index f12e2751..4425de3d 100644
--- a/extract/src/join.c
+++ b/extract/src/join.c
@@ -7,6 +7,7 @@
#include "outf.h"
#include <assert.h>
+#include <float.h>
#include <math.h>
#include <stdio.h>
@@ -17,24 +18,39 @@ static char_t* span_char_first(span_t* span)
return &span->chars[0];
}
+/* File-local shorthand for extract_line_span_first(). */
+static span_t* s_line_span_first(line_t* line)
+{
+    span_t* first = extract_line_span_first(line);
+    return first;
+}
+
/* Returns first char_t in a line. */
static char_t* line_item_first(line_t* line)
{
- span_t* span = line_span_first(line);
+ span_t* span = s_line_span_first(line);
return span_char_first(span);
}
/* Returns last char_t in a line. */
static char_t* line_item_last(line_t* line)
{
- span_t* span = line_span_last(line);
- return span_char_last(span);
+ span_t* span = extract_line_span_last(line);
+ return extract_span_char_last(span);
}
-static const char* matrix_string(const matrix_t* matrix)
+/* Returns a point_t holding the (x, y) position of <char_>. */
+static point_t char_to_point(const char_t* char_)
+{
+    point_t pos;
+
+    pos.y = char_->y;
+    pos.x = char_->x;
+
+    return pos;
+}
+
+const char* extract_matrix_string(const matrix_t* matrix)
+{
+ static char ret[5][64];
+ static int i = 0;
+ i = (i + 1) % 5;
+ snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f %f %f}",
matrix->a,
matrix->b,
matrix->c,
@@ -42,17 +58,17 @@ static const char* matrix_string(const matrix_t* matrix)
matrix->e,
matrix->f
);
- return ret;
+ return ret[i];
}
/* Returns total width of span. */
static double span_adv_total(span_t* span)
{
- double dx = span_char_last(span)->x - span_char_first(span)->x;
- double dy = span_char_last(span)->y - span_char_first(span)->y;
+ double dx = extract_span_char_last(span)->x - span_char_first(span)->x;
+ double dy = extract_span_char_last(span)->y - span_char_first(span)->y;
/* We add on the advance of the last item; this avoids us returning zero if
there's only one item. */
- double adv = span_char_last(span)->adv * matrix_expansion(span->trm);
+ double adv = extract_span_char_last(span)->adv * extract_matrix_expansion(span->trm);
return sqrt(dx*dx + dy*dy) + adv;
}
@@ -66,15 +82,30 @@ static double spans_adv(
double delta_x = b->x - a->x;
double delta_y = b->y - a->y;
double s = sqrt( delta_x*delta_x + delta_y*delta_y);
- double a_size = a->adv * matrix_expansion(a_span->trm);
+ double a_size = a->adv * extract_matrix_expansion(a_span->trm);
s -= a_size;
return s;
}
static double span_angle(span_t* span)
{
- /* Assume ctm is a rotation matix. */
double ret = atan2(-span->ctm.c, span->ctm.a);
+ if (0)
+ {
+ /* This is an attempt to take into account the trm matrix when looking
+ at spans, because for agstat.pdf vertical text seems to be achieved
+ by making trm rotate by 90 degrees. But it messes up the ordering of
+ rotated paragraphs in Python2.pdf so is disabled for now. */
+ matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm);
+ point_t dir;
+ double ret;
+ dir.x = span->flags.wmode ? 0 : 1;
+ dir.y = span->flags.wmode ? 1 : 0;
+ dir = extract_multiply_matrix_point(m, dir);
+ ret = atan2(dir.y, dir.x);
+ return ret;
+ }
+ /* Assume ctm is a rotation matix. */
outfx("ctm.a=%f ctm.b=%f ret=%f", span->ctm.a, span->ctm.b, ret);
return ret;
/* Not sure whether this is right. Inclined text seems to be done by
@@ -89,6 +120,22 @@ static double span_angle(span_t* span)
}*/
}
+/* Returns span_angle(span). As a diagnostic, for spans with more than one
+character, also computes the angle of the vector from the first to the last
+character and logs when it differs from span_angle() by more than 0.01. */
+static double span_angle2(span_t* span)
+{
+    if (span->chars_num > 1)
+    {
+        const char_t* head = &span->chars[0];
+        const char_t* tail = &span->chars[span->chars_num-1];
+        double from_ctm = span_angle(span);
+        double from_chars = atan2(-(tail->y - head->y), tail->x - head->x);
+
+        if (fabs(from_chars - from_ctm) > 0.01)
+        {
+            outf("### ret1=%f ret2=%f: %s", from_ctm, from_chars, extract_span_string(NULL, span));
+        }
+    }
+
+    return span_angle(span);
+}
+
/* Returns static string containing brief info about span_t. */
static const char* span_string2(extract_alloc_t* alloc, span_t* span)
{
@@ -182,36 +229,36 @@ static int lines_are_compatible(
{
if (a == b) return 0;
if (!a->spans || !b->spans) return 0;
- if (line_span_first(a)->flags.wmode != line_span_first(b)->flags.wmode) {
+ if (s_line_span_first(a)->flags.wmode != s_line_span_first(b)->flags.wmode) {
return 0;
}
- if (matrix_cmp4(
- &line_span_first(a)->ctm,
- &line_span_first(b)->ctm
+ if (extract_matrix_cmp4(
+ &s_line_span_first(a)->ctm,
+ &s_line_span_first(b)->ctm
)) {
if (verbose) {
outf("ctm's differ:");
outf(" %f %f %f %f %f %f",
- line_span_first(a)->ctm.a,
- line_span_first(a)->ctm.b,
- line_span_first(a)->ctm.c,
- line_span_first(a)->ctm.d,
- line_span_first(a)->ctm.e,
- line_span_first(a)->ctm.f
+ s_line_span_first(a)->ctm.a,
+ s_line_span_first(a)->ctm.b,
+ s_line_span_first(a)->ctm.c,
+ s_line_span_first(a)->ctm.d,
+ s_line_span_first(a)->ctm.e,
+ s_line_span_first(a)->ctm.f
);
outf(" %f %f %f %f %f %f",
- line_span_first(b)->ctm.a,
- line_span_first(b)->ctm.b,
- line_span_first(b)->ctm.c,
- line_span_first(b)->ctm.d,
- line_span_first(b)->ctm.e,
- line_span_first(b)->ctm.f
+ s_line_span_first(b)->ctm.a,
+ s_line_span_first(b)->ctm.b,
+ s_line_span_first(b)->ctm.c,
+ s_line_span_first(b)->ctm.d,
+ s_line_span_first(b)->ctm.e,
+ s_line_span_first(b)->ctm.f
);
}
return 0;
}
{
- double angle_b = span_angle(line_span_first(b));
+ double angle_b = span_angle(s_line_span_first(b));
if (angle_b != angle_a) {
outfx("%s:%i: angles differ");
return 0;
@@ -221,6 +268,80 @@ static int lines_are_compatible(
}
+/* Sentinel stored in char_t::ucs to mark a character that has been moved out
+of a span and must be removed from it. */
+static const unsigned ucs_NONE = ((unsigned) -1);
+
+static int s_span_inside_rects(
+        extract_alloc_t*    alloc,
+        span_t*             span,
+        rect_t*             rects,
+        int                 rects_num,
+        span_t*             o_span
+        )
+/* Returns with <o_span> containing char_t's from <span> that are inside
+rects[], and *span modified to remove any char_t's that we have moved to
+<o_span>.
+
+<o_span> starts as a copy of *span with its own copy of .font_name and an
+empty .chars[]. A character is inside a rect if min <= (x,y) < max.
+
+May return with span->chars_num == 0, in which case the caller must remove the
+span (including freeing .font_name), because lots of code assumes that there
+are no empty spans.
+
+Returns 0 on success or -1 on failure. On failure <o_span> never shares
+.font_name or .chars with <span>, so both can be freed independently. */
+{
+    int c;
+    *o_span = *span;
+    /* The struct copy makes o_span share span's heap data. Detach it before
+    doing anything that can fail, so that an early return cannot leave two
+    spans owning the same .font_name or .chars. */
+    o_span->font_name = NULL;
+    o_span->chars = NULL;
+    o_span->chars_num = 0;
+    if (extract_strdup(alloc, span->font_name, &o_span->font_name)) return -1;
+
+    for (c=0; c<span->chars_num; ++c)
+    {
+        /* For now we just look at whether span's (x, y) is within any
+        rects[]. We could instead try to find character's bounding box etc. */
+        char_t* char_ = &span->chars[c];
+        int r;
+        for (r=0; r<rects_num; ++r)
+        {
+            rect_t* rect = &rects[r];
+            if (1
+                    && char_->x >= rect->min.x
+                    && char_->x < rect->max.x
+                    && char_->y >= rect->min.y
+                    && char_->y < rect->max.y
+                    )
+            {
+                if (extract_span_append_c(alloc, o_span, char_->ucs)) return -1;
+                /* Coverity warns, but o_span must have at least one item. */
+                /* coverity[var_deref_op] */
+                *extract_span_char_last(o_span) = *char_;
+                char_->ucs = ucs_NONE; /* Mark for removal below, so it is not used again. */
+                break;
+            }
+        }
+    }
+
+    /* Remove any char_t's that we've used. */
+    {
+        int cc = 0;
+        for (c=0; c<span->chars_num; ++c)
+        {
+            char_t* char_ = &span->chars[c];
+            if (char_->ucs != ucs_NONE)
+            {
+                span->chars[cc] = span->chars[c];
+                cc += 1;
+            }
+        }
+        /* This might set span->chars_num to zero; our caller needs to remove
+        the span - lots of code assumes that all spans contain at least one
+        character. */
+        span->chars_num = cc;
+    }
+
+    if (o_span->chars_num)
+    {
+        outf("o_span: %s", extract_span_string(alloc, o_span));
+    }
+    return 0;
+}
+
/* Creates representation of span_t's that consists of a list of line_t's, with
each line_t contains pointers to a list of span_t's.
@@ -230,11 +351,16 @@ On entry:
Original value of *o_lines and *o_lines_num are ignored.
<spans> points to array of <spans_num> span_t*'s, each pointing to
- an span_t.
+ a span_t.
On exit:
If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num
- line_t*'s, each pointing to an line_t.
+ line_t*'s, each pointing to a line_t.
+
+ If <rects_num> is zero, each of these line_t's will contain pointers to
+ items in <spans>; otherwise each of the line_t's will contain new spans
+ which should be freed by the caller (spans are not necessarily wholly inside
+ or outside rects[] so we need to create new spans).
Otherwise we return -1 with errno set. *o_lines and *o_lines_num are
undefined.
@@ -242,35 +368,85 @@ On exit:
static int make_lines(
extract_alloc_t* alloc,
span_t** spans,
- int spans_num,
+ int* spans_num,
+ rect_t* rects,
+ int rects_num,
line_t*** o_lines,
int* o_lines_num
)
{
int ret = -1;
- /* Make an line_t for each span. Then we will join some of these
- line_t's together before returning. */
- int lines_num = spans_num;
+ /* Make a line_t for each span. Then we will join some of these line_t's
+ together before returning. */
+ int lines_num = 0;
line_t** lines = NULL;
int a;
int num_compatible;
int num_joins;
- if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end;
-
- /* Ensure we can clean up after error. */
- for (a=0; a<lines_num; ++a) {
- lines[a] = NULL;
- }
- for (a=0; a<lines_num; ++a) {
- if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end;
- lines[a]->spans_num = 0;
- if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end;
- lines[a]->spans_num = 1;
- lines[a]->spans[0] = spans[a];
- outfx("initial line a=%i: %s", a, line_string(lines[a]));
+ span_t* span = NULL;
+
+ if (rects_num)
+ {
+ /* Make <lines> contain new span_t's and char_t's that are inside rects[]. */
+ for (a=0; a<*spans_num; ++a)
+ {
+ if (spans[a]->chars_num == 0) continue; /* In case used for table, */
+ if (extract_realloc(alloc, &span, sizeof(*span))) goto end;
+ extract_span_init(span);
+ if (s_span_inside_rects(alloc, spans[a], rects, rects_num, span))
+ {
+ goto end;
+ }
+ if (span->chars_num)
+ {
+ if (extract_realloc(alloc, &lines, sizeof(*lines) * (lines_num + 1))) goto end;
+ if (extract_malloc(alloc, &lines[lines_num], sizeof(line_t))) goto end;
+ lines_num += 1;
+ if (extract_malloc(alloc, &lines[lines_num-1]->spans, sizeof(span_t*) * 1)) goto end;
+ lines[lines_num-1]->spans[0] = span;
+ lines[lines_num-1]->spans_num = 1;
+ span = NULL;
+ }
+ else
+ {
+ extract_span_free(alloc, &span);
+ }
+
+ if (!spans[a]->chars_num)
+ {
+ /* All characters in this span are inside table, so remove
+ entire span, otherwise the same characters will end up being
+ output outside the table also. */
+ extract_span_free(alloc, &spans[a]);
+ memmove(&spans[a], &spans[a+1], sizeof(*spans) * ((*spans_num) - (a+1)));
+ *spans_num -= 1;
+ a -= 1;
+ }
+ }
}
+ else
+ {
+ /* Make <lines> be a copy of <spans>. */
+ lines_num = *spans_num;
+ if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end;
+ /* Ensure we can clean up after error. */
+ for (a=0; a<lines_num; ++a) {
+ lines[a] = NULL;
+ }
+ for (a=0; a<lines_num; ++a) {
+ if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end;
+ lines[a]->spans_num = 0;
+ if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end;
+ lines[a]->spans_num = 1;
+ lines[a]->spans[0] = spans[a];
+ /* Ensure that spans[] can be safely freed now we've moved it into lines[]. */
+ spans[a] = NULL;
+ outfx("initial line a=%i: %s", a, line_string(lines[a]));
+ }
+ }
+
num_compatible = 0;
/* For each line, look for nearest aligned line, and append if found. */
@@ -290,14 +466,14 @@ static int make_lines(
}
if (0 && a < 1) verbose = 1;
- outfx("looking at line_a=%s", line_string2(line_a));
+ outfx("looking at line_a=%s", line_string2(alloc, line_a));
- span_a = line_span_last(line_a);
+ span_a = extract_line_span_last(line_a);
angle_a = span_angle(span_a);
if (verbose) outf("a=%i angle_a=%f ctm=%s: %s",
a,
angle_a * 180/pi,
- matrix_string(&span_a->ctm),
+ extract_matrix_string(&span_a->ctm),
line_string2(alloc, line_a)
);
@@ -310,7 +486,6 @@ static int make_lines(
continue;
}
if (verbose) {
- outf("");
outf("a=%i b=%i: nearest_line_b=%i nearest_adv=%f",
a,
b,
@@ -330,17 +505,17 @@ static int make_lines(
/* Find angle between last glyph of span_a and first glyph of
span_b. This detects whether the lines are lined up with each other
(as opposed to being at the same angle but in different lines). */
- span_t* span_b = line_span_first(line_b);
- double dx = span_char_first(span_b)->x - span_char_last(span_a)->x;
- double dy = span_char_first(span_b)->y - span_char_last(span_a)->y;
+ span_t* span_b = s_line_span_first(line_b);
+ double dx = span_char_first(span_b)->x - extract_span_char_last(span_a)->x;
+ double dy = span_char_first(span_b)->y - extract_span_char_last(span_a)->y;
double angle_a_b = atan2(-dy, dx);
const double angle_tolerance_deg = 1;
if (verbose) {
outf("delta=(%f %f) alast=(%f %f) bfirst=(%f %f): angle_a=%f angle_a_b=%f",
dx,
dy,
- span_char_last(span_a)->x,
- span_char_last(span_a)->y,
+ extract_span_char_last(span_a)->x,
+ extract_span_char_last(span_a)->y,
span_char_first(span_b)->x,
span_char_first(span_b)->y,
angle_a * 180 / pi,
@@ -353,7 +528,7 @@ static int make_lines(
/* Find distance between end of line_a and beginning of line_b. */
double adv = spans_adv(
span_a,
- span_char_last(span_a),
+ extract_span_char_last(span_a),
span_char_first(span_b)
);
if (verbose) outf("nearest_adv=%f. angle_a_b=%f adv=%f",
@@ -370,8 +545,8 @@ static int make_lines(
else {
if (verbose) outf(
"angle beyond tolerance: span_a last=(%f,%f) span_b first=(%f,%f) angle_a_b=%g angle_a=%g span_a.trm{a=%f b=%f}",
- span_char_last(span_a)->x,
- span_char_last(span_a)->y,
+ extract_span_char_last(span_a)->x,
+ extract_span_char_last(span_a)->y,
span_char_first(span_b)->x,
span_char_first(span_b)->y,
angle_a_b * 180 / pi,
@@ -386,24 +561,30 @@ static int make_lines(
if (nearest_line) {
/* line_a and nearest_line are aligned so we can move line_b's
spans on to the end of line_a. */
- span_t* span_b = line_span_first(nearest_line);
+ double average_adv;
+ span_t* span_b = s_line_span_first(nearest_line);
b = nearest_line_b;
if (verbose) outf("found nearest line. a=%i b=%i", a, b);
+ /* Find average advance of the two adjacent spans in the two
+ lines we are considering joining, so that we can decide whether
+ the distance between them is large enough to merit joining with
+ a space character. */
+ average_adv = (
+ (span_adv_total(span_a) + span_adv_total(span_b))
+ /
+ (double) (span_a->chars_num + span_b->chars_num)
+ );
+
+ if (0 && nearest_adv > 5 * average_adv)
+ {
+ continue;
+ }
+
if (1
- && span_char_last(span_a)->ucs != ' '
+ && extract_span_char_last(span_a)->ucs != ' '
&& span_char_first(span_b)->ucs != ' '
) {
- /* Find average advance of the two adjacent spans in the two
- lines we are considering joining, so that we can decide whether
- the distance between them is large enough to merit joining with
- a space character). */
- double average_adv = (
- (span_adv_total(span_a) + span_adv_total(span_b))
- /
- (double) (span_a->chars_num + span_b->chars_num)
- );
-
int insert_space = (nearest_adv > 0.25 * average_adv);
if (insert_space) {
/* Append space to span_a before concatenation. */
@@ -413,8 +594,8 @@ static int make_lines(
nearest_adv,
average_adv
);
- outf(" a: %s", span_string(alloc, span_a));
- outf(" b: %s", span_string(alloc, span_b));
+ outf(" a: %s", extract_span_string(alloc, span_a));
+ outf(" b: %s", extract_span_string(alloc, span_b));
}
if (extract_realloc2(
alloc,
@@ -427,6 +608,13 @@ static int make_lines(
extract_bzero(item, sizeof(*item));
item->ucs = ' ';
item->adv = nearest_adv;
+ /* This is a hack to give our extra space a vaguely useful
+ (x,y) coordinate - this can be used later on when ordering
+ paragraphs. We could try to be more accurate by adding
+ item[-1]'s .adv suitably transformed by .wmode, .ctm and
+ .trm. */
+ item->x = item[-1].x;
+ item->y = item[-1].y;
}
if (verbose) {
@@ -440,14 +628,14 @@ static int make_lines(
"joining line insert_space=%i a=%i (y=%f) to line b=%i (y=%f). nearest_adv=%f average_adv=%f",
insert_space,
a,
- span_char_last(span_a)->y,
+ extract_span_char_last(span_a)->y,
b,
span_char_first(span_b)->y,
nearest_adv,
average_adv
);
- outf("a: %s", span_string(alloc, span_a));
- outf("b: %s", span_string(alloc, span_b));
+ outf("a: %s", extract_span_string(alloc, span_a));
+ outf("b: %s", extract_span_string(alloc, span_b));
}
}
@@ -487,7 +675,7 @@ static int make_lines(
the new extended line_a needs checking again. */
a -= 1;
}
- outfx("new line is:\n %s", line_string2(line_a));
+ outfx("num_joins=%i new line is:\n %s", num_joins, line_string2(line_a));
}
}
@@ -524,7 +712,7 @@ static int make_lines(
ret = 0;
outf("Turned %i spans into %i lines. num_compatible=%i",
- spans_num,
+ *spans_num,
lines_num,
num_compatible
);
@@ -532,9 +720,18 @@ static int make_lines(
end:
if (ret) {
/* Free everything. */
+ extract_span_free(alloc, &span);
if (lines) {
for (a=0; a<lines_num; ++a) {
- if (lines[a]) extract_free(alloc, &lines[a]->spans);
+ if (lines[a])
+ {
+ int s;
+ for (s=0; s<lines[a]->spans_num; ++s)
+ {
+ extract_span_free(alloc, &lines[a]->spans[s]);
+ }
+ extract_free(alloc, &lines[a]->spans);
+ }
extract_free(alloc, &lines[a]);
}
}
@@ -552,7 +749,7 @@ static double line_font_size_max(line_t* line)
for (i=0; i<line->spans_num; ++i) {
span_t* span = line->spans[i];
/* fixme: <size> should be double, which changes some output. */
- double size = matrix_expansion(span->trm);
+ double size = extract_matrix_expansion(span->trm);
if (size > size_max) {
size_max = size;
}
@@ -581,21 +778,35 @@ respectively.
AQB is a right angle. We need to find AQ.
*/
-static double line_distance(
- double ax,
- double ay,
- double bx,
- double by,
- double angle
- )
+static double line_distance_y( double ax, double ay, double bx, double by, double angle)
{
double dx = bx - ax;
double dy = by - ay;
-
return dx * sin(angle) + dy * cos(angle);
}
+/* Returns distance QB in above diagram, i.e. the vector from (ax,ay) to
+(bx,by) projected onto the direction (cos(angle), -sin(angle)). */
+static double line_distance_x(double ax, double ay, double bx, double by, double angle)
+{
+    const double along_x = (bx - ax) * cos(angle);
+    const double along_y = (by - ay) * sin(angle);
+
+    return along_x - along_y;
+}
+
+/* Point-based wrapper for line_distance_x(): passes the .x and .y members of
+<a> and <b>, plus <angle>, straight through and returns the result. */
+static double line_distance_xp(point_t a, point_t b, double angle)
+{
+    double distance = line_distance_x(a.x, a.y, b.x, b.y, angle);
+    return distance;
+}
+
+/* Returns 0 if line_distance_xp() from <a_left> to <b_right> is negative, or
+if line_distance_xp() from <a_right> to <b_left> is zero or positive (both
+measured using <angle>). Otherwise returns 1. */
+static int lines_overlap(point_t a_left, point_t a_right, point_t b_left, point_t b_right, double angle)
+{
+    double a_left_to_b_right;
+    double a_right_to_b_left;
+
+    a_left_to_b_right = line_distance_xp(a_left, b_right, angle);
+    if (a_left_to_b_right < 0)
+    {
+        return 0;
+    }
+    a_right_to_b_left = line_distance_xp(a_right, b_left, angle);
+    return (a_right_to_b_left >= 0) ? 0 : 1;
+}
+
/* A comparison function for use with qsort(), for sorting paragraphs within a
page. */
@@ -606,14 +817,49 @@ static int paragraphs_cmp(const void* a, const void* b)
line_t* a_line = paragraph_line_first(*a_paragraph);
line_t* b_line = paragraph_line_first(*b_paragraph);
- span_t* a_span = line_span_first(a_line);
- span_t* b_span = line_span_first(b_line);
+ span_t* a_span = s_line_span_first(a_line);
+ span_t* b_span = s_line_span_first(b_line);
- /* If ctm matrices differ, always return this diff first. Note that we
- ignore .e and .f because if data is from ghostscript then .e and .f vary
- for each span, and we don't care about these differences. */
- int d = matrix_cmp4(&a_span->ctm, &b_span->ctm);
- if (d) return d;
+ if (0)
+ {
+ double a_angle = span_angle2(a_span);
+ double b_angle = span_angle2(b_span);
+ if (fabs(a_angle - b_angle) > 0.01)
+ {
+ outf0("angles differ: a_angle=%f b_angle=%f", a_angle, b_angle);
+ outf0("a_span: %s", extract_span_string(NULL, a_span));
+ outf0("b_span: %s", extract_span_string(NULL, b_span));
+ if (a_angle - b_angle > 3.14/2) {
+ /* Give up if more than 90 deg. */
+ return 0;
+ }
+ if (a_angle > b_angle) return 1;
+ if (a_angle < b_angle) return -1;
+ return 0;
+ }
+ }
+ if (1)
+ {
+ /* If ctm matrices differ, always return this diff first. Note that we
+ ignore .e and .f because if data is from ghostscript then .e and .f
+ vary for each span, and we don't care about these differences. */
+ int d = extract_matrix_cmp4(&a_span->ctm, &b_span->ctm);
+ if (d)
+ {
+ outf("extract_matrix_cmp4() returned non-zero.");
+ outf("a_span->ctm=%s trm=%s: %s",
+ extract_matrix_string(&a_span->ctm),
+ extract_matrix_string(&a_span->trm),
+ extract_span_string(NULL, a_span)
+ );
+ outf("b_span->ctm=%s trm=%s: %s",
+ extract_matrix_string(&b_span->ctm),
+ extract_matrix_string(&a_span->trm),
+ extract_span_string(NULL, b_span)
+ );
+ return d;
+ }
+ }
{
double a_angle = line_angle(a_line);
@@ -628,7 +874,7 @@ static int paragraphs_cmp(const void* a, const void* b)
double ay = line_item_first(a_line)->y;
double bx = line_item_first(b_line)->x;
double by = line_item_first(b_line)->y;
- double distance = line_distance(ax, ay, bx, by, angle);
+ double distance = line_distance_y(ax, ay, bx, by, angle);
if (distance > 0) return -1;
if (distance < 0) return +1;
}
@@ -669,7 +915,7 @@ static int make_paragraphs(
int num_joins;
paragraph_t** paragraphs = NULL;
- /* Start off with an paragraph_t for each line_t. */
+ /* Start off with a paragraph_t for each line_t. */
int paragraphs_num = lines_num;
if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * paragraphs_num)) goto end;
/* Ensure we can clean up after error when setting up. */
@@ -685,11 +931,12 @@ static int make_paragraphs(
paragraphs[a]->lines[0] = lines[a];
}
+ /* Now join paragraphs together where possible. */
num_joins = 0;
for (a=0; a<paragraphs_num; ++a) {
- paragraph_t* nearest_paragraph;
- int nearest_paragraph_b;
- double nearest_paragraph_distance;
+ paragraph_t* nearest_paragraph = NULL;
+ int nearest_paragraph_b = -1;
+ double nearest_paragraph_distance = -1;
line_t* line_a;
double angle_a;
int verbose;
@@ -702,14 +949,9 @@ static int make_paragraphs(
continue;
}
- nearest_paragraph = NULL;
- nearest_paragraph_b = -1;
- nearest_paragraph_distance = -1;
assert(paragraph_a->lines_num > 0);
-
line_a = paragraph_line_last(paragraph_a);
angle_a = line_angle(line_a);
-
verbose = 0;
/* Look for nearest paragraph_t that could be appended to
@@ -732,7 +974,7 @@ static int make_paragraphs(
double ay = line_item_last(line_a)->y;
double bx = line_item_first(line_b)->x;
double by = line_item_first(line_b)->y;
- double distance = line_distance(ax, ay, bx, by, angle_a);
+ double distance = line_distance_y(ax, ay, bx, by, angle_a);
if (verbose) {
outf(
"angle_a=%f a=(%f %f) b=(%f %f) delta=(%f %f) distance=%f:",
@@ -746,17 +988,39 @@ static int make_paragraphs(
outf(" line_a=%s", line_string2(alloc, line_a));
outf(" line_b=%s", line_string2(alloc, line_b));
}
- if (distance > 0) {
+ if (distance > 0)
+ {
if (nearest_paragraph_distance == -1
- || distance < nearest_paragraph_distance) {
- if (verbose) {
- outf("updating nearest. distance=%f:", distance);
- outf(" line_a=%s", line_string2(alloc, line_a));
- outf(" line_b=%s", line_string2(alloc, line_b));
+ || distance < nearest_paragraph_distance)
+ {
+ int ok = 1;
+ if (0)
+ {
+ /* Check whether lines overlap horizontally. */
+ point_t a_left = char_to_point(line_item_first(line_a));
+ point_t b_left = char_to_point(line_item_first(line_b));
+ point_t a_right = char_to_point(line_item_last(line_a));
+ point_t b_right = char_to_point(line_item_last(line_b));
+
+ if (!lines_overlap(a_left, a_right, b_left, b_right, angle_a))
+ {
+ outf("Not joining lines because not overlapping.");
+ ok = 0;
+ }
+ }
+
+ if (ok)
+ {
+ if (verbose) {
+ outf("updating nearest. distance=%f:", distance);
+ outf(" line_a=%s", line_string2(alloc, line_a));
+ outf(" line_b=%s", line_string2(alloc, line_b));
+ }
+
+ nearest_paragraph_distance = distance;
+ nearest_paragraph_b = b;
+ nearest_paragraph = paragraph_b;
}
- nearest_paragraph_distance = distance;
- nearest_paragraph_b = b;
- nearest_paragraph = paragraph_b;
}
}
}
@@ -787,24 +1051,34 @@ static int make_paragraphs(
outf(" %s", paragraph_string(alloc, paragraph_a));
outf(" %s", paragraph_string(alloc, nearest_paragraph));
outf("paragraph_a ctm=%s",
- matrix_string(&paragraph_a->lines[0]->spans[0]->ctm)
+ extract_matrix_string(&paragraph_a->lines[0]->spans[0]->ctm)
);
outf("paragraph_a trm=%s",
- matrix_string(&paragraph_a->lines[0]->spans[0]->trm)
+ extract_matrix_string(&paragraph_a->lines[0]->spans[0]->trm)
);
}
/* Join these two paragraph_t's. */
- a_span = line_span_last(line_a);
- if (span_char_last(a_span)->ucs == '-') {
+ a_span = extract_line_span_last(line_a);
+ if (extract_span_char_last(a_span)->ucs == '-'
+ || extract_span_char_last(a_span)->ucs == 0x2212 /* unicode dash */
+ )
+ {
/* remove trailing '-' at end of prev line. char_t doesn't
contain any malloc-heap pointers so this doesn't leak. */
a_span->chars_num -= 1;
}
- else {
+ else if (extract_span_char_last(a_span)->ucs == ' ')
+ {
+ }
+ else if (extract_span_char_last(a_span)->ucs == '/')
+ {
+ }
+ else
+ {
/* Insert space before joining adjacent lines. */
char_t* c_prev;
char_t* c;
- if (span_append_c(alloc, line_span_last(line_a), ' ')) goto end;
+ if (extract_span_append_c(alloc, extract_line_span_last(line_a), ' ')) goto end;
c_prev = &a_span->chars[ a_span->chars_num-2];
c = &a_span->chars[ a_span->chars_num-1];
c->x = c_prev->x + c_prev->adv * a_span->ctm.a;
@@ -834,9 +1108,10 @@ static int make_paragraphs(
num_joins += 1;
outfx(
- "have joined paragraph a=%i to snearest_paragraph_b=%i",
+ "have joined paragraph a=%i to nearest_paragraph_b=%i. num_joins=%i.",
a,
- nearest_paragraph_b
+ nearest_paragraph_b,
+ num_joins
);
if (nearest_paragraph_b > a) {
@@ -884,26 +1159,21 @@ static int make_paragraphs(
/* Sort paragraphs so they appear in correct order, using paragraphs_cmp().
*/
- qsort(
- paragraphs,
- paragraphs_num,
- sizeof(paragraph_t*), paragraphs_cmp
- );
+ qsort(paragraphs, paragraphs_num, sizeof(paragraph_t*), paragraphs_cmp);
*o_paragraphs = paragraphs;
*o_paragraphs_num = paragraphs_num;
ret = 0;
- outf("Turned %i lines into %i paragraphs",
- lines_num,
- paragraphs_num
- );
-
+ outf("Turned %i lines into %i paragraphs", lines_num, paragraphs_num);
end:
- if (ret) {
- if (paragraphs) {
- for (a=0; a<paragraphs_num; ++a) {
+ if (ret)
+ {
+ if (paragraphs)
+ {
+ for (a=0; a<paragraphs_num; ++a)
+ {
if (paragraphs[a]) extract_free(alloc, &paragraphs[a]->lines);
extract_free(alloc, &paragraphs[a]);
}
@@ -913,39 +1183,688 @@ static int make_paragraphs(
return ret;
}
-int extract_document_join(extract_alloc_t* alloc, document_t* document)
+static int s_join_page_rects(
+ extract_alloc_t* alloc,
+ extract_page_t* page,
+ rect_t* rects,
+ int rects_num,
+ line_t*** lines,
+ int* lines_num,
+ paragraph_t*** paragraphs,
+ int* paragraphs_num
+ )
+/* Extracts text that is inside any of rects[0..rects_num], or all text if
+rects_num is zero. */
{
- int ret = -1;
+ if (make_lines(
+ alloc,
+ page->spans,
+ &page->spans_num,
+ rects,
+ rects_num,
+ lines,
+ lines_num
+ )) return -1;
+ if (make_paragraphs(
+ alloc,
+ *lines,
+ *lines_num,
+ paragraphs,
+ paragraphs_num
+ )) return -1;
+
+ return 0;
+}
+
+
+static int tablelines_compare_x(const void* a, const void* b)
+/* Comparison callback in the form qsort() expects, for arrays of tableline_t.
+Orders by rect.min.x, and falls back to rect.min.y when neither x is greater
+than the other. Returns +1, -1 or 0. */
+{
+    const tableline_t* lhs = a;
+    const tableline_t* rhs = b;
+    int d = (lhs->rect.min.x > rhs->rect.min.x) - (lhs->rect.min.x < rhs->rect.min.x);
+    if (!d)
+    {
+        d = (lhs->rect.min.y > rhs->rect.min.y) - (lhs->rect.min.y < rhs->rect.min.y);
+    }
+    return d;
+}
- /* For each page in <document> we join spans into lines and paragraphs. A
- line is a list of spans that are at the same angle and on the same line. A
- paragraph is a list of lines that are at the same angle and close together.
+static int tablelines_compare_y(const void* a, const void* b)
+/* Comparison callback in the form qsort() expects, for arrays of tableline_t.
+Orders by rect.min.y, and falls back to rect.min.x when neither y is greater
+than the other. Returns +1, -1 or 0. */
+{
+    const tableline_t* lhs = a;
+    const tableline_t* rhs = b;
+    int d = (lhs->rect.min.y > rhs->rect.min.y) - (lhs->rect.min.y < rhs->rect.min.y);
+    if (!d)
+    {
+        d = (lhs->rect.min.x > rhs->rect.min.x) - (lhs->rect.min.x < rhs->rect.min.x);
+    }
+    return d;
+}
+
+static int table_find_y_range(extract_alloc_t* alloc, tablelines_t* all, double y_min, double y_max,
+        tablelines_t* out)
+/* Appends to <out> a copy of each line in <all> whose rect.min.y satisfies
+y_min <= rect.min.y < y_max. Lines outside this range are only logged.
+
+Returns 0 on success, or -1 as soon as extract_realloc() reports failure. */
+{
+    int i;
+    for (i=0; i<all->tablelines_num; ++i)
+    {
+        const double y = all->tablelines[i].rect.min.y;
+        const int in_range = (y >= y_min && y < y_max);
+        if (!in_range)
+        {
+            outf("Excluding line because outside y=%f..%f: %s", y_min, y_max, extract_rect_string(&all->tablelines[i].rect));
+            continue;
+        }
+        /* Grow <out> by one slot, then copy the line in by value. */
+        if (extract_realloc(alloc, &out->tablelines, sizeof(*out->tablelines) * (out->tablelines_num + 1))) return -1;
+        out->tablelines[out->tablelines_num] = all->tablelines[i];
+        out->tablelines_num += 1;
+    }
+    return 0;
+}
+
+
+static int overlap(double a_min, double a_max, double b_min, double b_max)
+/* Returns one if a_min..a_max significantly overlaps b_min..b_max, otherwise
+zero. 'Significantly' means that b, after being clipped to a, covers more than
+0.8 of the length of a. */
+{
+    double fraction;
+    int above_low;
+    int above_high;
+    assert(a_min < a_max);
+    assert(b_min < b_max);
+    /* Clip b to a; if they are disjoint, b collapses to zero length. */
+    b_min = (b_min < a_min) ? a_min : b_min;
+    b_max = (b_max > a_max) ? a_max : b_max;
+    b_max = (b_max < b_min) ? b_min : b_max;
+    fraction = (b_max - b_min) / (a_max - a_min);
+    above_low = fraction > 0.2;
+    above_high = fraction > 0.8;
+    if (above_low != above_high)
+    {
+        /* Between the two thresholds; diagnostic is disabled. */
+        if (0) outf0("warning, unclear overlap=%f: a=%f..%f b=%f..%f", fraction, a_min, a_max, b_min, b_max);
+    }
+    return above_high;
+}
+
+void extract_cell_init(cell_t* cell)
+/* Puts <cell> into an empty state: rect all zero, .above/.left and
+.extend_right/.extend_down zero, and no lines or paragraphs. Any existing
+.lines and .paragraphs pointers are overwritten, not freed. */
+{
+    /* Text content. */
+    cell->lines = NULL;
+    cell->lines_num = 0;
+    cell->paragraphs = NULL;
+    cell->paragraphs_num = 0;
+
+    /* Border flags and extents. */
+    cell->above = 0;
+    cell->left = 0;
+    cell->extend_right = 0;
+    cell->extend_down = 0;
+
+    /* Geometry. */
+    cell->rect.min.x = cell->rect.min.y = 0;
+    cell->rect.max.x = cell->rect.max.y = 0;
+}
+
+
+static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y)
+{
+ /* Find cell extensions to right and down by looking at cells' .left and
+ .above flags.
+
+ For example for adjacent cells ABC..., we extend A to include cells BC..
+ until we reach a cell with .left set to one.
+
+ ABCDE
+ FGHIJ
+ KLMNO
+
+ When looking to extend cell A, we only look at cells in the same column or
+ same row, (i.e. in the above example we look at BCDE and FK, and not at
+ GHIJ and LMNO).
+
+ For example if BCDE have no left lines and FK have no above lines, we
+ ignore any lines in GHIJ and LMNO and make A extend to the entire 3x5
+ box. Having found this box, we set .above=0 in all other enclosed cells, and
+ .left=0 in all of them except those in the box's first column (where .left is
+ set to 1), which simplifies html table generation code.
*/
- int p;
- for (p=0; p<document->pages_num; ++p) {
- extract_page_t* page = document->pages[p];
- outf("processing page %i: num_spans=%i", p, page->spans_num);
+ int y;
+ for (y=0; y<cells_num_y; ++y)
+ {
+ int x;
+ for (x=0; x<cells_num_x; ++x)
+ {
+ cell_t* cell = cells[y * cells_num_x + x];
+ outf("xy=(%i %i) above=%i left=%i", x, y, cell->above, cell->left);
+ if (cell->left && cell->above)
+ {
+ /* See how far this cell extends to right and down. */
+ int xx;
+ int yy;
+ for (xx=x+1; xx<cells_num_x; ++xx)
+ {
+ if (cells[y * cells_num_x + xx]->left) break;
+ }
+ cell->extend_right = xx - x;
+ cell->rect.max.x = cells[y * cells_num_x + xx-1]->rect.max.x;
+ for (yy=y+1; yy<cells_num_y; ++yy)
+ {
+ if (cells[yy * cells_num_x + x]->above) break;
+ }
+ cell->extend_down = yy - y;
+ cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y;
+
+ /* Clear .above and .left in enclosed cells. */
+ for (xx = x; xx < x + cell->extend_right; ++xx)
+ {
+ int yy;
+ for (yy = y; yy < y + cell->extend_down; ++yy)
+ {
+ cell_t* cell2 = cells[cells_num_x * yy + xx];
+ if ( xx==x && yy==y)
+ {}
+ else
+ {
+ if (xx==x)
+ {
+ cell2->extend_right = cell->extend_right;
+ }
+ cell2->above = 0;
+ /* We set .left to 1 for left-most cells - e.g. F
+ and K in the above diagram; this allows us to
+ generate correct html without lots of recursing
+ looking for extend_down in earlier cells. */
+ cell2->left = (xx == x);
+ outf("xy=(%i %i) xxyy=(%i %i) have set cell2->above=%i left=%i",
+ x, y, xx, yy, cell2->above, cell2->left
+ );
+ }
+ }
+ }
+ }
+ }
+ }
+ return 0;
+}
- if (make_lines(
- alloc,
- page->spans,
- page->spans_num,
- &page->lines,
- &page->lines_num
- )) goto end;
- if (make_paragraphs(
+static int table_find_cells_text(extract_alloc_t* alloc, extract_page_t* page,
+ cell_t** cells, int cells_num_x, int cells_num_y)
+/* Sets each cell to contain the text that is within the cell's boundary. We
+remove any found text from the page. */
+{
+ /* Find text within each cell. We don't attempt to handle images within
+ cells. */
+ int e = -1;
+ int i;
+ int cells_num = cells_num_x * cells_num_y;
+ for (i=0; i<cells_num; ++i)
+ {
+ cell_t* cell = cells[i];
+ if (!cell->above || !cell->left) continue;
+ if (s_join_page_rects(
alloc,
- page->lines,
- page->lines_num,
- &page->paragraphs,
- &page->paragraphs_num
- )) goto end;
+ page,
+ &cell->rect,
+ 1 /*rects_num*/,
+ &cell->lines,
+ &cell->lines_num,
+ &cell->paragraphs,
+ &cell->paragraphs_num
+ )) return -1;
}
+
+ /* Append the table we have found to page->tables[]. */
+ if (extract_realloc(alloc, &page->tables, sizeof(*page->tables) * (page->tables_num + 1))) goto end;
+ if (extract_malloc(alloc, &page->tables[page->tables_num], sizeof(*page->tables[page->tables_num]))) goto end;
+ page->tables[page->tables_num]->pos.x = cells[0]->rect.min.x;
+ page->tables[page->tables_num]->pos.y = cells[0]->rect.min.y;
+ page->tables[page->tables_num]->cells = cells;
+ page->tables[page->tables_num]->cells_num_x = cells_num_x;
+ page->tables[page->tables_num]->cells_num_y = cells_num_y;
+ page->tables_num += 1;
+
+ if (0)
+ {
+ /* For debugging. */
+ int y;
+ outf0("table:\n");
+ for (y=0; y<cells_num_y; ++y)
+ {
+ int x;
+ for (x=0; x<cells_num_x; ++x)
+ {
+ cell_t* cell = cells[cells_num_x * y + x];
+ fprintf(stderr, " %c%c x=%i y=% 3i 3i w=%i h=%i",
+ cell->left ? '|' : ' ',
+ cell->above ? '-' : ' ',
+ x,
+ y,
+ cell->extend_right,
+ cell->extend_down
+ );
+ }
+ fprintf(stderr, "\n");
+ }
+
+ }
+
+ e = 0;
+ end:
+ return e;
+}
- ret = 0;
+static int table_find(extract_alloc_t* alloc, extract_page_t* page, double y_min, double y_max)
+/* Finds single table made from lines whose y coordinates are in the range
+y_min..y_max.
+
+The horizontal and vertical lines in range define a grid of cells. Columns in
+which no cell has both .above and .left set are removed, cells are merged by
+table_find_extend(), and table_find_cells_text() then fills the cells with
+text and appends the table to <page>.
+
+Returns 0 on success (including when no cells are found), otherwise -1, in
+which case all cells created here are freed. */
+{
+    tablelines_t* all_h = &page->tablelines_horizontal;
+    tablelines_t* all_v = &page->tablelines_vertical;
+    int e = -1;
+    int i;
+
+    /* Subsets of the horizontal and vertical lines that are within range
+    y_min..y_max. tl_h keeps the order of <all_h> (the row grouping below
+    compares consecutive rect.min.y values, so relies on that order being by
+    y); tl_v is sorted by x coordinate below. */
+    tablelines_t tl_h = {NULL, 0};
+    tablelines_t tl_v = {NULL, 0};
+    cell_t** cells = NULL;
+    int cells_num = 0;
+    int cells_num_x = 0;
+    int cells_num_y = 0;
+    int x;
+    int y;
+
+    outf("y=(%f %f)", y_min, y_max);
+
+    if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end;
+    if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end;
+    /* Suppress false coverity warning - qsort() does not dereference null
+    pointer if nmemb is zero. */
+    /* coverity[var_deref_model] */
+    qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x);
+
+    if (0)
+    {
+        /* Show raw lines info. */
+        outf0("all_h->tablelines_num=%i tl_h.tablelines_num=%i", all_h->tablelines_num, tl_h.tablelines_num);
+        for (i=0; i<tl_h.tablelines_num; ++i)
+        {
+            outf0(" %i: %s", i, extract_rect_string(&tl_h.tablelines[i].rect));
+        }
+
+        outf0("all_v->tablelines_num=%i tl_v.tablelines_num=%i", all_v->tablelines_num, tl_v.tablelines_num);
+        for (i=0; i<tl_v.tablelines_num; ++i)
+        {
+            outf0(" %i: %s", i, extract_rect_string(&tl_v.tablelines[i].rect));
+        }
+    }
+    /* Find the cells defined by the vertical and horizontal lines.
+
+    It seems that lines can be disjoint, e.g. what looks like a single
+    horizontal line could be made up of multiple lines all with the same
+    y coordinate, so we use i_next and j_next to skip these sublines when
+    iterating. */
+    cells = NULL;
+    cells_num = 0;
+    cells_num_x = 0;
+    cells_num_y = 0;
+    for (i=0; i<tl_h.tablelines_num; )
+    {
+        int i_next;
+        int j;
+        for (i_next=i+1; i_next<tl_h.tablelines_num; ++i_next)
+        {
+            if (tl_h.tablelines[i_next].rect.min.y - tl_h.tablelines[i].rect.min.y > 5) break;
+        }
+        if (i_next == tl_h.tablelines_num)
+        {
+            /* Ignore last row of points - cells need another row below. */
+            break;
+        }
+        cells_num_y += 1;
+
+        for (j=0; j<tl_v.tablelines_num; )
+        {
+            int j_next;
+            int ii;
+            int jj;
+            cell_t* cell;
+
+            for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next)
+            {
+                if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break;
+            }
+            outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect));
+
+            /* Ignore last column of points - cells need another column to
+            the right. */
+            if (j_next == tl_v.tablelines_num) break;
+
+            if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end;
+            if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end;
+            cell = cells[cells_num];
+            cells_num += 1;
+            /* The number of columns is counted while making the first row. */
+            if (i==0) cells_num_x += 1;
+
+            cell->rect.min.x = tl_v.tablelines[j].rect.min.x;
+            cell->rect.min.y = tl_h.tablelines[i].rect.min.y;
+            cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x;
+            cell->rect.max.y = (i_next < tl_h.tablelines_num) ? tl_h.tablelines[i_next].rect.min.y : cell->rect.min.y;
+            cell->above = (i==0);
+            cell->left = (j==0);
+            cell->extend_right = 1;
+            cell->extend_down = 1;
+            cell->lines = NULL;
+            cell->lines_num = 0;
+            cell->paragraphs = NULL;
+            cell->paragraphs_num = 0;
+
+            /* Set cell->above if there is a horizontal line above the cell. */
+            outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect));
+            for (ii = i; ii < i_next; ++ii)
+            {
+                tableline_t* h = &tl_h.tablelines[ii];
+                if (overlap(
+                        cell->rect.min.x,
+                        cell->rect.max.x,
+                        h->rect.min.x,
+                        h->rect.max.x
+                        ))
+                {
+                    cell->above = 1;
+                    break;
+                }
+            }
+
+            /* Set cell->left if there is a vertical line to the left of the cell. */
+            for (jj = j; jj < j_next; ++jj)
+            {
+                tableline_t* v = &tl_v.tablelines[jj];
+                if (overlap(
+                        cell->rect.min.y,
+                        cell->rect.max.y,
+                        v->rect.min.y,
+                        v->rect.max.y
+                        ))
+                {
+                    cell->left = 1;
+                    break;
+                }
+            }
+
+            j = j_next;
+        }
+
+        i = i_next;
+    }
+
+    assert(cells_num == cells_num_x * cells_num_y);
+
+    /* Remove columns where no cells have .above and .left - these will not
+    appear. It also avoids spurious empty columns when table uses
+    closely-spaced double lines as separators. (Only columns are removed
+    here, not rows.) */
+    for (x=0; x<cells_num_x; ++x)
+    {
+        int has_cells = 0;
+        for (y=0; y<cells_num_y; ++y)
+        {
+            cell_t* cell = cells[y * cells_num_x + x];
+            if (cell->above && cell->left)
+            {
+                has_cells = 1;
+                break;
+            }
+        }
+        if (!has_cells)
+        {
+            /* Remove column <x>, compacting cells[] in place. */
+            int j = 0;
+            outf("Removing column %i. cells_num=%i cells_num_x=%i cells_num_y=%i", x, cells_num, cells_num_x, cells_num_y);
+            for (i=0; i<cells_num; ++i)
+            {
+                if (i % cells_num_x == x)
+                {
+                    extract_cell_free(alloc, &cells[i]);
+                    continue;
+                }
+                cells[j] = cells[i];
+                j += 1;
+            }
+            cells_num -= cells_num_y;
+            cells_num_x -= 1;
+            /* The following column has moved into slot <x>, so step back to
+            make the next iteration examine <x> again instead of skipping
+            it. */
+            x -= 1;
+        }
+    }
+
+    if (cells_num == 0)
+    {
+        e = 0;
+        goto end;
+    }
+
+    if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end;
+
+    if (table_find_cells_text(alloc, page, cells, cells_num_x, cells_num_y)) goto end;
+
+    e = 0;
end:
+    extract_free(alloc, &tl_h.tablelines);
+    extract_free(alloc, &tl_v.tablelines);
+    if (e)
+    {
+        for (i=0; i<cells_num; ++i)
+        {
+            extract_cell_free(alloc, &cells[i]);
+        }
+        extract_free(alloc, &cells);
+    }
+    return e;
+}
- return ret;
+
+static int extract_page_tables_find_lines(
+        extract_alloc_t* alloc,
+        extract_page_t* page
+        )
+/* Finds tables in <page> by looking for lines in page->tablelines_horizontal
+and page->tablelines_vertical that look like table dividers.
+
+Both arrays are sorted in place by y. They are then walked together in order
+of rect.min.y and split into groups wherever a line's rect.min.y is more than
+<margin> beyond the largest rect.max.y seen so far; table_find() is called for
+each group.
+
+Any text found inside tables is removed from page->spans[].
+
+Returns 0 on success, or -1 if table_find() fails.
+*/
+{
+    double miny;
+    double maxy;
+    double margin = 1;
+    int iv;
+    int ih;
+    outf("page->tablelines_horizontal.tablelines_num=%i", page->tablelines_horizontal.tablelines_num);
+    outf("page->tablelines_vertical.tablelines_num=%i", page->tablelines_vertical.tablelines_num);
+
+    /* Sort all lines by y coordinate. */
+    qsort(
+            page->tablelines_horizontal.tablelines,
+            page->tablelines_horizontal.tablelines_num,
+            sizeof(*page->tablelines_horizontal.tablelines),
+            tablelines_compare_y
+            );
+    qsort(
+            page->tablelines_vertical.tablelines,
+            page->tablelines_vertical.tablelines_num,
+            sizeof(*page->tablelines_vertical.tablelines),
+            tablelines_compare_y
+            );
+
+    if (0)
+    {
+        /* Show info about lines. */
+        int i;
+        outf0("tablelines_horizontal:");
+        for (i=0; i<page->tablelines_horizontal.tablelines_num; ++i)
+        {
+            outf0(" color=%f: %s",
+                    page->tablelines_horizontal.tablelines[i].color,
+                    extract_rect_string(&page->tablelines_horizontal.tablelines[i].rect)
+                    );
+        }
+        outf0("tablelines_vertical:");
+        for (i=0; i<page->tablelines_vertical.tablelines_num; ++i)
+        {
+            outf0(" color=%f: %s",
+                    page->tablelines_vertical.tablelines[i].color,
+                    extract_rect_string(&page->tablelines_vertical.tablelines[i].rect)
+                    );
+        }
+    }
+
+    /* Look for completely separate vertical regions that define different
+    tables, by looking for vertical gaps between the rects of each
+    horizontal/vertical line. */
+    maxy = -DBL_MAX;
+    miny = -DBL_MAX;
+    iv = 0;
+    ih = 0;
+    for(;;)
+    {
+        tableline_t* tlv = NULL;
+        tableline_t* tlh = NULL;
+        tableline_t* tl;
+        if (iv < page->tablelines_vertical.tablelines_num)
+        {
+            tlv = &page->tablelines_vertical.tablelines[iv];
+        }
+        /* We only consider horizontal lines that are not white. This is a bit
+        of a cheat to get the right behaviour with twotables_2.pdf. */
+        while (ih < page->tablelines_horizontal.tablelines_num)
+        {
+            if (page->tablelines_horizontal.tablelines[ih].color == 1)
+            {
+                /* Ignore white horizontal lines. */
+                ++ih;
+            }
+            else
+            {
+                tlh = &page->tablelines_horizontal.tablelines[ih];
+                break;
+            }
+        }
+        /* Take whichever of the next vertical/horizontal line has the
+        smaller rect.min.y; stop when both arrays are exhausted. */
+        if (tlv && tlh)
+        {
+            tl = (tlv->rect.min.y < tlh->rect.min.y) ? tlv : tlh;
+        }
+        else if (tlv) tl = tlv;
+        else if (tlh) tl = tlh;
+        else break;
+        if (tl == tlv) iv += 1;
+        else ih += 1;
+        if (tl->rect.min.y > maxy + margin)
+        {
+            if (maxy > miny)
+            {
+                outf("New table. maxy=%f miny=%f", maxy, miny);
+                /* Find table. Propagate failure instead of discarding it. */
+                if (table_find(alloc, page, miny - margin, maxy + margin)) return -1;
+            }
+            miny = tl->rect.min.y;
+        }
+        if (tl->rect.max.y > maxy) maxy = tl->rect.max.y;
+    }
+
+    /* Find last table. */
+    if (table_find(alloc, page, miny - margin, maxy + margin)) return -1;
+
+    return 0;
+}
+
+
+static void show_tables(table_t** tables, int tables_num)
+/* For debugging only. Logs, via outf0(), the number of tables, then each
+table's dimensions followed by one line per cell in row-major order. */
+{
+    int t;
+    outf0("tables_num=%i", tables_num);
+    for (t=0; t<tables_num; ++t)
+    {
+        table_t* table = tables[t];
+        const int rows = table->cells_num_y;
+        const int cols = table->cells_num_x;
+        int row;
+        outf0("table %i: cells_num_y=%i cells_num_x=%i", t, rows, cols);
+        for (row=0; row<rows; ++row)
+        {
+            int col;
+            for (col=0; col<cols; ++col)
+            {
+                cell_t* cell = table->cells[cols * row + col];
+                outf0("cell: y=% 3i x=% 3i: left=%i above=%i rect=%s",
+                        row, col, cell->left, cell->above, extract_rect_string(&cell->rect));
+            }
+        }
+    }
+}
+
+static int extract_page_tables_find(
+        extract_alloc_t* alloc,
+        extract_page_t* page
+        )
+/* Find tables in <page>.
+
+Currently the only strategy is extract_page_tables_find_lines(); in future
+other functions may be called here that find tables in different ways, e.g. by
+analysing an image of a page, or looking for blocks of whitespace in between
+chunks of text.
+
+Returns 0 on success, otherwise -1. */
+{
+    if (extract_page_tables_find_lines(alloc, page) != 0)
+    {
+        return -1;
+    }
+
+    if (0)
+    {
+        /* Disabled diagnostic dump of what was found. */
+        outf0("=== tables from extract_page_tables_find_lines():");
+        show_tables(page->tables, page->tables_num);
+    }
+
+    return 0;
+}
+
+static int extract_document_join_page(
+        extract_alloc_t* alloc,
+        extract_page_t* page
+        )
+/* Finds tables and paragraphs on <page>. Returns 0 on success, otherwise
+-1. */
+{
+    int e;
+
+    /* Tables are found first. This removes text that is within tables from
+    page->spans, so that text doesn't appear more than once in the final
+    output. */
+    if (extract_page_tables_find(alloc, page)) return -1;
+
+    /* Whatever spans remain are joined into lines and paragraphs; passing no
+    rects selects all text. */
+    e = s_join_page_rects(
+            alloc,
+            page,
+            NULL /*rects*/,
+            0 /*rects_num*/,
+            &page->lines,
+            &page->lines_num,
+            &page->paragraphs,
+            &page->paragraphs_num
+            );
+    if (!e) return 0;
+
+    outf0("s_join_page_rects failed. page->spans_num=%i page->lines_num=%i page->paragraphs_num=%i",
+            page->spans_num,
+            page->lines_num,
+            page->paragraphs_num
+            );
+    return -1;
+}
+
+
+int extract_document_join(extract_alloc_t* alloc, document_t* document)
+{
+    /* Each page in <document> is processed in turn: tables are found, then
+    spans are joined into lines and paragraphs.
+
+    A line is a list of spans that are at the same angle and on the same
+    line. A paragraph is a list of lines that are at the same angle and close
+    together.
+
+    Processing stops at the first page that fails, returning -1; otherwise 0
+    is returned. */
+    int p = 0;
+    while (p < document->pages_num)
+    {
+        extract_page_t* page = document->pages[p];
+
+        outf("processing page %i: num_spans=%i", p, page->spans_num);
+        if (extract_document_join_page(alloc, page)) return -1;
+        p += 1;
+    }
+
+    return 0;
}
diff --git a/extract/src/mem.c b/extract/src/mem.c
index 83b5032c..1c3c96e6 100644
--- a/extract/src/mem.c
+++ b/extract/src/mem.c
@@ -19,16 +19,26 @@ void extract_bzero(void *b, size_t len)
int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va)
{
int n;
- int n2;
+ int ret;
va_list va2;
va_copy(va2, va);
n = vsnprintf(NULL, 0, format, va);
- if (n < 0) return n;
- if (extract_malloc(alloc, out, n + 1)) return -1;
- n2 = vsnprintf(*out, n + 1, format, va2);
+ if (n < 0)
+ {
+ ret = n;
+ goto end;
+ }
+ if (extract_malloc(alloc, out, n + 1))
+ {
+ ret = -1;
+ goto end;
+ }
+ vsnprintf(*out, n + 1, format, va2);
+ ret = 0;
+
+ end:
va_end(va2);
- assert(n2 == n);
- return n2;
+ return ret;
}
diff --git a/extract/src/mem.h b/extract/src/mem.h
index ffdcb049..2611b04f 100644
--- a/extract/src/mem.h
+++ b/extract/src/mem.h
@@ -8,8 +8,17 @@
void extract_bzero(void *b, size_t len);
-int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va);
-int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...);
+int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va)
+ #ifdef __GNUC__
+ __attribute__ ((format (printf, 3, 0)))
+ #endif
+ ;
+
+int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...)
+ #ifdef __GNUC__
+ __attribute__ ((format (printf, 3, 4)))
+ #endif
+ ;
int extract_strdup(extract_alloc_t* alloc, const char* s, char** o_out);
diff --git a/extract/src/memento.py b/extract/src/memento.py
index 987cd4fd..55171e39 100755
--- a/extract/src/memento.py
+++ b/extract/src/memento.py
@@ -3,20 +3,29 @@
'''
Post-processor for Memento.
+Usage:
+ memento.py <args> [<command> ...]
+
Args:
-q <quiet>
Controls how often we output 'Memory squeezing @ ...' lines. E.g. '-q
10' outputs for multiples of 10.
+
+If <command> is specified we run it and look at the output. Otherwise we assume
+that Memento output is available on our stdin.
'''
import os
import re
+import subprocess
import sys
def main():
quiet = 1
+ quiet_next = 0
out_raw = None
+ command = None
args = iter(sys.argv[1:])
while 1:
try:
@@ -29,15 +38,32 @@ def main():
out_raw = open(next(args), 'w')
elif arg == '-q':
quiet = int(next(args))
- else:
+ elif arg.startswith('-'):
raise Exception(f'unrecognised arg: {arg}')
+ else:
+ command = arg
+ for a in args:
+ command += f' {a}'
+
+ if command:
+ print(f'Running: {command}')
+ child = subprocess.Popen(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ shell=True,
+ text=True,
+ )
+ stdin = child.stdout
+ else:
+ stdin = sys.stdin
openbsd = os.uname()[0] == 'OpenBSD'
n = None
segv = 0
leaks = 0
lines = []
- for line in sys.stdin:
+ for line in stdin:
if out_raw:
out_raw.write(line)
m = re.match('^Memory squeezing @ ([0-9]+)( complete)?', line)
@@ -45,7 +71,7 @@ def main():
if not m.group(2):
# Start of squeeze.
- if not openbsd:
+ if 0 and not openbsd:
# Looks like memento's forked processes might terminate
# before they get to output the 'Memory squeezing @ <N>
# complete' line.
@@ -53,9 +79,10 @@ def main():
assert n is None, f'n={n} line={line!r}'
n = int(m.group(1))
- if n % quiet == 0:
- sys.stdout.write(line)
+ if n >= quiet_next:
+ sys.stdout.write(f'quiet_next={quiet_next!r} n={n!r}: {line}')
sys.stdout.flush()
+ quiet_next = (n + quiet) // quiet * quiet
else:
# End of squeeze.
assert n == int(m.group(1))
@@ -66,6 +93,8 @@ def main():
if l.endswith('\n'):
l = l[:-1]
print(f' {l}')
+ if command:
+ print(f'Examine with: MEMENTO_FAILAT={n} {command}')
lines = []
segv = 0
leaks = 0
diff --git a/extract/src/misc-test.c b/extract/src/misc-test.c
index 58b098ff..5e658e8f 100644
--- a/extract/src/misc-test.c
+++ b/extract/src/misc-test.c
@@ -35,6 +35,15 @@ static void s_check(
if (!ok) s_num_fails += 1;
}
+static void s_check_e( int e, const char* text)
+/* Records a test failure if <e> is non-zero: bumps s_num_fails and prints <e>
+together with <text>, which describes what was being checked. Does nothing
+when <e> is zero. */
+{
+    if (!e) return;
+
+    s_num_fails += 1;
+    printf( "Error: e=%i: %s\n", e, text);
+}
+
static void s_check_int(const char* text, int value_expected, int expected_errno)
{
int value;
@@ -59,6 +68,53 @@ static void s_check_uint(const char* text, unsigned expected_value, int expected
return;
}
+static void s_check_xml_parse(void)
+/* Feeds a handful of well-formed and malformed tags through
+extract_buffer_open_simple(), extract_xml_pparse_init() and
+extract_xml_pparse_next(), and checks that each call returns zero and that the
+resulting tag has a non-null name and non-null attribute names/values.
+Failures are reported via s_check_e().
+
+NOTE(review): <buffer> is never closed and <tag> is never freed in this
+function; if the buffer/xml APIs provide matching close/free calls they should
+probably be used here - confirm against buffer.h and xml.h. */
+{
+    int e;
+    extract_buffer_t* buffer = NULL;
+    extract_xml_tag_t tag;
+    unsigned i;
+    const char* texts[] = {
+            "<foo a=1>text</foo>",
+            "< >",
+            "<foo bar=>",
+            "< bar=>",
+            "< =>",
+            };
+
+    extract_xml_tag_init( &tag);
+
+    for (i=0; i<sizeof(texts) / sizeof(texts[0]); ++i)
+    {
+        const char* text = texts[i];
+        int j;
+        printf("testing extract_xml_pparse_*(): %s\n", text);
+        e = extract_buffer_open_simple(
+                NULL /*alloc*/,
+                text,
+                strlen(text),
+                NULL /*handle*/,
+                NULL /*fn_close*/,
+                &buffer
+                );
+        s_check_e( e, "extract_buffer_open_simple()");
+        /* Each step depends on the previous one having succeeded; once a step
+        has failed (and been counted) we must not carry on with a buffer or tag
+        that was not set up for this text. */
+        if (e) continue;
+
+        e = extract_xml_pparse_init( NULL /*alloc*/, buffer, NULL /*first_line*/);
+        s_check_e( e, "extract_xml_pparse_init()");
+        if (e) continue;
+
+        e = extract_xml_pparse_next( buffer, &tag);
+        s_check_e( e, "extract_xml_pparse_next()");
+        if (e) continue;
+
+        s_check_e( tag.name ? 0 : 1, "tag.name is not null");
+
+        /* Use distinct text for name and value so that a failure identifies
+        which of the two was null. */
+        for (j=0; j<tag.attributes_num; ++j)
+        {
+            s_check_e( tag.attributes[j].name ? 0 : 1, "attribute name is non-null");
+            s_check_e( tag.attributes[j].value ? 0 : 1, "attribute value is non-null");
+        }
+    }
+}
+
int main(void)
{
printf("testing extract_xml_str_to_int():\n");
@@ -73,6 +129,8 @@ int main(void)
s_check_uint("-20b", 0, EINVAL);
s_check_uint("123456789123", 0, ERANGE);
+ s_check_xml_parse();
+
printf("s_num_fails=%i\n", s_num_fails);
if (s_num_fails) {
diff --git a/extract/src/odt.c b/extract/src/odt.c
index bacb362d..9e369078 100644
--- a/extract/src/odt.c
+++ b/extract/src/odt.c
@@ -21,6 +21,7 @@ odt_paragraph_finish(). */
#include <assert.h>
#include <errno.h>
+#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
@@ -29,17 +30,16 @@ odt_paragraph_finish(). */
#include <sys/stat.h>
-static int extract_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
{
return extract_astring_cat(alloc, content, "\n\n<text:p>");
}
-static int extract_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
{
return extract_astring_cat(alloc, content, "</text:p>");
}
-
/* ODT doesn't seem to support ad-hoc inline font specifications; instead
we have to define a style at the start of the content.xml file. So when
writing content we insert a style name and add the required styles to a
@@ -48,10 +48,7 @@ extract_odt_styles_t struct. */
struct extract_odt_style_t
{
int id; /* A unique id for this style. */
- char* font_name;
- double font_size;
- int font_bold;
- int font_italic;
+ font_t font;
};
struct extract_odt_styles_t
@@ -61,41 +58,47 @@ struct extract_odt_styles_t
int styles_num;
};
-static int extract_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b)
+static int s_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b)
{
int d;
double dd;
- if ((d = strcmp(a->font_name, b->font_name))) return d;
- if ((dd = a->font_size - b->font_size) != 0.0) return (dd > 0.0) ? 1 : -1;
- if ((d = a->font_bold - b->font_bold)) return d;
- if ((d = a->font_italic - b->font_italic)) return d;
+ if ((d = strcmp(a->font.name, b->font.name))) return d;
+ if ((dd = a->font.size - b->font.size) != 0.0) return (dd > 0.0) ? 1 : -1;
+ if ((d = a->font.bold - b->font.bold)) return d;
+ if ((d = a->font.italic - b->font.italic)) return d;
return 0;
}
-static int extract_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text)
+static int s_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text)
{
- const char* font_name = style->font_name;
+ const char* font_name = style->font.name;
/* This improves output e.g. for zlib.3.pdf, but clearly a hack. */
if (0 && strstr(font_name, "Helvetica"))
{
font_name = "Liberation Sans";
}
- outf("style->font_name=%s font_name=%s", style->font_name, font_name);
+ outf("style->font_name=%s font_name=%s", style->font.name, font_name);
if (extract_astring_catf(alloc, text, "<style:style style:name=\"T%i\" style:family=\"text\">", style->id)) return -1;
if (extract_astring_catf(alloc, text, "<style:text-properties style:font-name=\"%s\"", font_name)) return -1;
- if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font_size)) return -1;
- if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font_bold ? "bold" : "normal")) return -1;
- if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font_italic ? "italic" : "normal")) return -1;
+ if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font.size)) return -1;
+ if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font.bold ? "bold" : "normal")) return -1;
+ if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font.italic ? "italic" : "normal")) return -1;
if (extract_astring_cat(alloc, text, " /></style:style>")) return -1;
return 0;
}
void extract_odt_styles_free(extract_alloc_t* alloc, extract_odt_styles_t* styles)
+/* Frees everything owned by <styles>: the font name of each style (these are
+duplicated with extract_strdup() when a style is added) and then the styles
+array itself. <styles> itself is not freed. */
{
+    int i;
+    for (i=0; i<styles->styles_num; ++i)
+    {
+        extract_odt_style_t* style = &styles->styles[i];
+        extract_free(alloc, &style->font.name);
+    }
extract_free(alloc, &styles->styles);
+    /* Keep the count in step with the released array, so that calling this
+    function again, or iterating over <styles> afterwards, does not index an
+    array that no longer exists. */
+    styles->styles_num = 0;
}
-static int extract_odt_styles_definitions(
+static int s_odt_styles_definitions(
extract_alloc_t* alloc,
extract_odt_styles_t* styles,
extract_astring_t* out
@@ -105,7 +108,7 @@ static int extract_odt_styles_definitions(
if (extract_astring_cat(alloc, out, "<office:automatic-styles>")) return -1;
for (i=0; i<styles->styles_num; ++i)
{
- if (extract_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1;
+ if (s_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1;
}
extract_astring_cat(alloc, out, "<style:style style:name=\"gr1\" style:family=\"graphic\">\n");
extract_astring_cat(alloc, out, "<style:graphic-properties"
@@ -159,25 +162,22 @@ static int extract_odt_styles_definitions(
return 0;
}
-static int styles_add(
+static int s_odt_styles_add(
extract_alloc_t* alloc,
extract_odt_styles_t* styles,
- const char* font_name,
- double font_size,
- int font_bold,
- int font_italic,
+ font_t* font,
extract_odt_style_t** o_style
)
/* Adds specified style to <styles> if not already present. Sets *o_style to
point to the style_t within <styles>. */
{
- extract_odt_style_t style = {0 /*id*/, (char*) font_name, font_size, font_bold, font_italic};
+ extract_odt_style_t style = {0 /*id*/, *font};
int i;
/* We keep styles->styles[] sorted; todo: use bsearch or similar when
searching. */
for (i=0; i<styles->styles_num; ++i)
{
- int d = extract_odt_style_compare(&style, &styles->styles[i]);
+ int d = s_odt_style_compare(&style, &styles->styles[i]);
if (d == 0)
{
*o_style = &styles->styles[i];
@@ -190,92 +190,79 @@ point to the style_t within <styles>. */
memmove(&styles->styles[i+1], &styles->styles[i], sizeof(styles->styles[0]) * (styles->styles_num - i));
styles->styles_num += 1;
styles->styles[i].id = styles->styles_num + 10; /* Leave space for template's built-in styles. */
- if (extract_strdup(alloc, font_name, &styles->styles[i].font_name)) return -1;
- styles->styles[i].font_size = font_size;
- styles->styles[i].font_bold = font_bold;
- styles->styles[i].font_italic = font_italic;
+ if (extract_strdup(alloc, font->name, &styles->styles[i].font.name)) return -1;
+ styles->styles[i].font.size = font->size;
+ styles->styles[i].font.bold = font->bold;
+ styles->styles[i].font.italic = font->italic;
*o_style = &styles->styles[i];
return 0;
}
static int extract_odt_run_start(
- extract_alloc_t* alloc,
- extract_astring_t* content,
- extract_odt_styles_t* styles,
- const char* font_name,
- double font_size,
- int bold,
- int italic
+ extract_alloc_t* alloc,
+ extract_astring_t* content,
+ extract_odt_styles_t* styles,
+ content_state_t* content_state
)
-/* Starts a new run. Caller must ensure that extract_odt_run_finish() was
+/* Starts a new run. Caller must ensure that s_odt_run_finish() was
called to terminate any previous run. */
{
extract_odt_style_t* style;
- if (styles_add(alloc, styles, font_name, font_size, bold, italic, &style)) return -1;
+ if (s_odt_styles_add(
+ alloc,
+ styles,
+ &content_state->font,
+ &style
+ )) return -1;
if (extract_astring_catf(alloc, content, "<text:span text:style-name=\"T%i\">", style->id)) return -1;
return 0;
}
-static int extract_odt_run_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_run_finish(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content)
{
+ if (content_state) content_state->font.name = NULL;
return extract_astring_cat(alloc, content, "</text:span>");
}
-static int extract_odt_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles)
+static int s_odt_append_empty_paragraph(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles)
/* Append an empty paragraph to *content. */
{
int e = -1;
- if (extract_odt_paragraph_start(alloc, content)) goto end;
+ static char fontname[] = "OpenSans";
+ content_state_t content_state = {0};
+ if (s_odt_paragraph_start(alloc, content)) goto end;
/* [This comment is from docx, haven't checked odt.] It seems like our
- choice of font size here doesn't make any difference to the ammount of
+ choice of font size here doesn't make any difference to the amount of
vertical space, unless we include a non-space character. Presumably
something to do with the styles in the template document. */
- if (extract_odt_run_start(
- alloc,
- content,
- styles,
- "OpenSans",
- 10 /*font_size*/,
- 0 /*font_bold*/,
- 0 /*font_italic*/
- )) goto end;
+ content_state.font.name = fontname;
+ content_state.font.size = 10;
+ content_state.font.bold = 0;
+ content_state.font.italic = 0;
+ if (extract_odt_run_start(alloc, content, styles, &content_state)) goto end;
//docx_char_append_string(content, "&#160;"); /* &#160; is non-break space. */
- if (extract_odt_run_finish(alloc, content)) goto end;
- if (extract_odt_paragraph_finish(alloc, content)) goto end;
+ if (s_odt_run_finish(alloc, NULL /*content_state*/, content)) goto end;
+ if (s_odt_paragraph_finish(alloc, content)) goto end;
e = 0;
end:
return e;
}
-typedef struct
-{
- const char* font_name;
- double font_size;
- int font_bold;
- int font_italic;
- matrix_t* ctm_prev;
- /* todo: add extract_odt_styles_t member? */
-} content_state_t;
-/* Used to keep track of font information when writing paragraphs of odt
-content, e.g. so we know whether a font has changed so need to start a new odt
-span. */
-
-
-static int extract_document_to_odt_content_paragraph(
+static int s_document_to_odt_content_paragraph(
extract_alloc_t* alloc,
- content_state_t* state,
+ content_state_t* content_state,
paragraph_t* paragraph,
extract_astring_t* content,
extract_odt_styles_t* styles
)
-/* Append odt xml for <paragraph> to <content>. Updates *state if we change
-font. */
+/* Append odt xml for <paragraph> to <content>. Updates *content_state if we
+change font. */
{
int e = -1;
int l;
- if (extract_odt_paragraph_start(alloc, content)) goto end;
+ if (s_odt_paragraph_start(alloc, content)) goto end;
for (l=0; l<paragraph->lines_num; ++l)
{
@@ -286,50 +273,41 @@ font. */
int si;
span_t* span = line->spans[s];
double font_size_new;
- state->ctm_prev = &span->ctm;
+ content_state->ctm_prev = &span->ctm;
font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm);
- if (!state->font_name
- || strcmp(span->font_name, state->font_name)
- || span->flags.font_bold != state->font_bold
- || span->flags.font_italic != state->font_italic
- || font_size_new != state->font_size
+ if (!content_state->font.name
+ || strcmp(span->font_name, content_state->font.name)
+ || span->flags.font_bold != content_state->font.bold
+ || span->flags.font_italic != content_state->font.italic
+ || font_size_new != content_state->font.size
)
{
- if (state->font_name)
+ if (content_state->font.name)
{
- if (extract_odt_run_finish(alloc, content)) goto end;
+ if (s_odt_run_finish(alloc, content_state, content)) goto end;
}
- state->font_name = span->font_name;
- state->font_bold = span->flags.font_bold;
- state->font_italic = span->flags.font_italic;
- state->font_size = font_size_new;
- if (extract_odt_run_start(
- alloc,
- content,
- styles,
- state->font_name,
- state->font_size,
- state->font_bold,
- state->font_italic
- )) goto end;
+ content_state->font.name = span->font_name;
+ content_state->font.bold = span->flags.font_bold;
+ content_state->font.italic = span->flags.font_italic;
+ content_state->font.size = font_size_new;
+ if (extract_odt_run_start( alloc, content, styles, content_state)) goto end;
}
for (si=0; si<span->chars_num; ++si)
{
char_t* char_ = &span->chars[si];
int c = char_->ucs;
- if (extract_astring_cat_xmlc(alloc, content, c)) goto end;
+ if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end;
}
/* Remove any trailing '-' at end of line. */
- if (astring_char_truncate_if(content, '-')) goto end;
+ if (extract_astring_char_truncate_if(content, '-')) goto end;
}
}
- if (state->font_name)
+ if (content_state->font.name)
{
- if (extract_odt_run_finish(alloc, content)) goto end;
- state->font_name = NULL;
+ if (s_odt_run_finish(alloc, content_state, content)) goto end;
}
- if (extract_odt_paragraph_finish(alloc, content)) goto end;
+ if (s_odt_paragraph_finish(alloc, content)) goto end;
e = 0;
@@ -337,7 +315,7 @@ font. */
return e;
}
-static int extract_document_append_image(
+static int s_odt_append_image(
extract_alloc_t* alloc,
extract_astring_t* content,
image_t* image
@@ -362,7 +340,7 @@ static int extract_document_append_image(
}
-static int extract_document_output_rotated_paragraphs(
+static int s_odt_output_rotated_paragraphs(
extract_alloc_t* alloc,
extract_page_t* page,
int paragraph_begin,
@@ -375,14 +353,14 @@ static int extract_document_output_rotated_paragraphs(
int text_box_id,
extract_astring_t* content,
extract_odt_styles_t* styles,
- content_state_t* state
+ content_state_t* content_state
)
/* Writes paragraph to content inside rotated text box. */
{
int e = 0;
int p;
double pt_to_inch = 1/72.0;
- outf("rotated paragraphs: rotation_rad=%f (x y)=(%i %i) (w h)=(%i %i)", rotation_rad, x_pt, y_pt, w_pt, h_pt);
+ outf("rotated paragraphs: rotation_rad=%f (x y)=(%f %f) (w h)=(%f %f)", rotation_rad, x_pt, y_pt, w_pt, h_pt);
// https://docs.oasis-open.org/office/OpenDocument/v1.3/cs02/part3-schema/OpenDocument-v1.3-cs02-part3-schema.html#attribute-draw_transform
// says rotation is in degrees, but we seem to require -radians.
@@ -414,7 +392,7 @@ static int extract_document_output_rotated_paragraphs(
for (p=paragraph_begin; p<paragraph_end; ++p)
{
paragraph_t* paragraph = page->paragraphs[p];
- if (!e) e = extract_document_to_odt_content_paragraph(alloc, state, paragraph, content, styles);
+ if (!e) e = s_document_to_odt_content_paragraph(alloc, content_state, paragraph, content, styles);
}
if (!e) e = extract_astring_cat(alloc, content, "\n");
@@ -427,6 +405,219 @@ static int extract_document_output_rotated_paragraphs(
}
+static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content, extract_odt_styles_t* styles)
+/* Appends odt xml for <table> to <content>: a <table:table> element holding
+one <table:table-column> per column followed by one <table:table-row> per row.
+Cell text is written with s_document_to_odt_content_paragraph(), which is
+passed <styles>.
+
+Returns 0 on success, -1 if any append fails. */
+{
+ int e = -1;
+ int y;
+
+ {
+ int x;
+ /* Function-static counter, incremented once per call, used to give each
+ table we write a different table:name. */
+ static int table_number = 0;
+ table_number += 1;
+ if (extract_astring_catf(alloc, content,
+ "\n"
+ " <table:table text:style-name=\"extract.table\" table:name=\"extract.table.%i\">\n"
+ " <table:table-columns>\n"
+ ,
+ table_number
+ )) goto end;
+
+ /* One column declaration per table column. */
+ for (x=0; x<table->cells_num_x; ++x)
+ {
+ if (extract_astring_cat(alloc, content,
+ " <table:table-column table:style-name=\"extract.table.column\"/>\n"
+ )) goto end;
+ }
+ if (extract_astring_cat(alloc, content,
+ " </table:table-columns>\n"
+ )) goto end;
+ }
+ for (y=0; y<table->cells_num_y; ++y)
+ {
+ int x;
+ if (extract_astring_cat(alloc, content,
+ " <table:table-row>\n"
+ )) goto end;
+
+ for (x=0; x<table->cells_num_x; ++x)
+ {
+ /* table->cells[] is indexed row by row. */
+ cell_t* cell = table->cells[y*table->cells_num_x + x];
+ /* NOTE(review): presumably <above> and <left> are both set only for
+ the top-left cell of a (possibly merged) region, so any other cell
+ is written as a covered placeholder - confirm against cell_t. */
+ if (!cell->above || !cell->left)
+ {
+ if (extract_astring_cat(alloc, content, " <table:covered-table-cell/>\n")) goto end;
+ continue;
+ }
+
+ if (extract_astring_cat(alloc, content, " <table:table-cell")) goto end;
+ /* Span attributes are only written when the cell covers more than
+ one column/row. */
+ if (cell->extend_right > 1)
+ {
+ if (extract_astring_catf(alloc, content, " table:number-columns-spanned=\"%i\"", cell->extend_right)) goto end;
+ }
+ if (cell->extend_down > 1)
+ {
+ if (extract_astring_catf(alloc, content, " table:number-rows-spanned=\"%i\"", cell->extend_down)) goto end;
+ }
+ if (extract_astring_catf(alloc, content, ">\n")) goto end;
+
+ /* Write contents of this cell. */
+ {
+ int p;
+ /* Start each cell with no current font, so the first span in the
+ cell always starts a new run. */
+ content_state_t content_state;
+ content_state.font.name = NULL;
+ content_state.ctm_prev = NULL;
+ for (p=0; p<cell->paragraphs_num; ++p)
+ {
+ paragraph_t* paragraph = cell->paragraphs[p];
+ if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end;
+ }
+ /* Close any run that is still open after the last paragraph. */
+ if (content_state.font.name)
+ {
+ if (s_odt_run_finish(alloc, &content_state, content)) goto end;
+ }
+ if (extract_astring_cat(alloc, content, "\n")) goto end;
+ }
+ if (extract_astring_cat(alloc, content, " </table:table-cell>\n")) goto end;
+ }
+ if (extract_astring_cat(alloc, content, " </table:table-row>\n")) goto end;
+ }
+ if (extract_astring_cat(alloc, content, " </table:table>\n")) goto end;
+ e = 0;
+
+ end:
+ return e;
+}
+
+
+static int s_odt_append_rotated_paragraphs(
+ extract_alloc_t* alloc,
+ extract_page_t* page,
+ content_state_t* content_state,
+ int* p,
+ int* text_box_id,
+ const matrix_t* ctm,
+ double rotate,
+ extract_astring_t* content,
+ extract_odt_styles_t* styles
+ )
+/* Appends paragraphs with same rotation, starting with page->paragraphs[*p]
+and updates *p.
+
+<ctm> and <rotate> are the matrix and rotation of page->paragraphs[*p], as
+passed in by the caller. *text_box_id is incremented to give the text box a
+unique id. On success *p is left at the index of the last paragraph that was
+written, not one past it.
+
+Returns 0 on success, -1 if s_odt_output_rotated_paragraphs() fails. */
+{
+ /* Find extent of paragraphs with this same rotation. extent
+ will contain max width and max height of paragraphs, in units
+ before application of ctm, i.e. before rotation. */
+ int e = -1;
+ point_t extent = {0, 0};
+ int p0 = *p;
+ int p1;
+ paragraph_t* paragraph = page->paragraphs[*p];
+
+ outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
+ rotate, rotate * 180 / pi,
+ ctm->e,
+ ctm->f,
+ ctm->a,
+ ctm->b,
+ ctm->c,
+ ctm->d
+ );
+
+ {
+ /* We assume that first span is at origin of text
+ block. This assumes left-to-right text. */
+ /* <rotate> and <ctm> are overwritten inside the loop below, so remember
+ the values for the first paragraph and restore them afterwards. */
+ double rotate0 = rotate;
+ const matrix_t* ctm0 = ctm;
+ point_t origin =
+ {
+ paragraph->lines[0]->spans[0]->chars[0].x,
+ paragraph->lines[0]->spans[0]->chars[0].y
+ };
+ /* Inverse of the a,b,c,d part of <ctm>; stays as the identity if <ctm>
+ cannot be inverted. */
+ matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
+ double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
+ if (ctm_det != 0)
+ {
+ ctm_inverse.a = +ctm->d / ctm_det;
+ ctm_inverse.b = -ctm->b / ctm_det;
+ ctm_inverse.c = -ctm->c / ctm_det;
+ ctm_inverse.d = +ctm->a / ctm_det;
+ }
+ else
+ {
+ outf("cannot invert ctm=(%f %f %f %f)",
+ ctm->a, ctm->b, ctm->c, ctm->d);
+ }
+
+ /* Advance *p over consecutive paragraphs whose rotation is exactly equal
+ to that of the first one, growing <extent> as we go. */
+ for (*p=p0; *p<page->paragraphs_num; ++*p)
+ {
+ paragraph = page->paragraphs[*p];
+ ctm = &paragraph->lines[0]->spans[0]->ctm;
+ rotate = atan2(ctm->b, ctm->a);
+ if (rotate != rotate0)
+ {
+ break;
+ }
+
+ /* Update <extent>. */
+ {
+ int l;
+ for (l=0; l<paragraph->lines_num; ++l)
+ {
+ /* Use the end of the last character on the line, i.e. its
+ position plus its advance along the direction of <rotate>. */
+ line_t* line = paragraph->lines[l];
+ span_t* span = extract_line_span_last(line);
+ char_t* char_ = extract_span_char_last(span);
+ double adv = char_->adv * extract_matrix_expansion(span->trm);
+ double x = char_->x + adv * cos(rotate);
+ double y = char_->y + adv * sin(rotate);
+
+ double dx = x - origin.x;
+ double dy = y - origin.y;
+
+ /* Position relative to origin and before box rotation. */
+ double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
+ double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
+ yy = -yy;
+ if (xx > extent.x) extent.x = xx;
+ if (yy > extent.y) extent.y = yy;
+ if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
+ rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span));
+ }
+ }
+ }
+ /* p1 is one past the last paragraph with this rotation. */
+ p1 = *p;
+ rotate = rotate0;
+ ctm = ctm0;
+ outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
+ rotate, p0, p1, extent.x, extent.y);
+ }
+
+ /* Paragraphs p0..p1-1 have same rotation. We output them into
+ a single rotated text box. */
+
+ /* We need unique id for text box. */
+ *text_box_id += 1;
+
+ if (s_odt_output_rotated_paragraphs(
+ alloc,
+ page,
+ p0,
+ p1,
+ rotate,
+ ctm->e,
+ ctm->f,
+ extent.x,
+ extent.y,
+ *text_box_id,
+ content,
+ styles,
+ content_state
+ )) goto end;
+ /* Leave *p on the last paragraph we wrote; the caller steps past it. */
+ *p = p1 - 1;
+ e = 0;
+
+ end:
+ return e;
+}
+
+
int extract_document_to_odt_content(
extract_alloc_t* alloc,
document_t* document,
@@ -445,156 +636,66 @@ int extract_document_to_odt_content(
for (p=0; p<document->pages_num; ++p)
{
extract_page_t* page = document->pages[p];
- int p;
- content_state_t state;
- state.font_name = NULL;
- state.font_size = 0;
- state.font_bold = 0;
- state.font_italic = 0;
- state.ctm_prev = NULL;
+ int p = 0;
+ int t = 0;
+ content_state_t content_state;
+ content_state.font.name = NULL;
+ content_state.font.size = 0;
+ content_state.font.bold = 0;
+ content_state.font.italic = 0;
+ content_state.ctm_prev = NULL;
- for (p=0; p<page->paragraphs_num; ++p)
+ for(;;)
{
- paragraph_t* paragraph = page->paragraphs[p];
- const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
- double rotate = atan2(ctm->b, ctm->a);
+ paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p];
+ table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+ double y_paragraph;
+ double y_table;
+ if (!paragraph && !table) break;
+ y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+ y_table = (table) ? table->pos.y : DBL_MAX;
- if (spacing
- && state.ctm_prev
- && paragraph->lines_num
- && paragraph->lines[0]->spans_num
- && matrix_cmp4(
- state.ctm_prev,
- &paragraph->lines[0]->spans[0]->ctm
- )
- )
+ if (paragraph && y_paragraph < y_table)
{
- /* Extra vertical space between paragraphs that were at
- different angles in the original document. */
- if (extract_odt_paragraph_empty(alloc, content, styles)) goto end;
- }
+ const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
+ double rotate = atan2(ctm->b, ctm->a);
+
+ if (spacing
+ && content_state.ctm_prev
+ && paragraph->lines_num
+ && paragraph->lines[0]->spans_num
+ && extract_matrix_cmp4(
+ content_state.ctm_prev,
+ &paragraph->lines[0]->spans[0]->ctm
+ )
+ )
+ {
+ /* Extra vertical space between paragraphs that were at
+ different angles in the original document. */
+ if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end;
+ }
- if (spacing)
- {
- /* Extra vertical space between paragraphs. */
- if (extract_odt_paragraph_empty(alloc, content, styles)) goto end;
- }
-
- if (rotation && rotate != 0)
- {
- /* Find extent of paragraphs with this same rotation. extent
- will contain max width and max height of paragraphs, in units
- before application of ctm, i.e. before rotation. */
- point_t extent = {0, 0};
- int p0 = p;
- int p1;
-
- outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
- rotate, rotate * 180 / pi,
- ctm->e,
- ctm->f,
- ctm->a,
- ctm->b,
- ctm->c,
- ctm->d
- );
-
+ if (spacing)
{
- /* We assume that first span is at origin of text
- block. This assumes left-to-right text. */
- double rotate0 = rotate;
- const matrix_t* ctm0 = ctm;
- point_t origin =
- {
- paragraph->lines[0]->spans[0]->chars[0].x,
- paragraph->lines[0]->spans[0]->chars[0].y
- };
- matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
- double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
- if (ctm_det != 0)
- {
- ctm_inverse.a = +ctm->d / ctm_det;
- ctm_inverse.b = -ctm->b / ctm_det;
- ctm_inverse.c = -ctm->c / ctm_det;
- ctm_inverse.d = +ctm->a / ctm_det;
- }
- else
- {
- outf("cannot invert ctm=(%f %f %f %f)",
- ctm->a, ctm->b, ctm->c, ctm->d);
- }
-
- for (p=p0; p<page->paragraphs_num; ++p)
- {
- paragraph = page->paragraphs[p];
- ctm = &paragraph->lines[0]->spans[0]->ctm;
- rotate = atan2(ctm->b, ctm->a);
- if (rotate != rotate0)
- {
- break;
- }
-
- /* Update <extent>. */
- {
- int l;
- for (l=0; l<paragraph->lines_num; ++l)
- {
- line_t* line = paragraph->lines[l];
- span_t* span = line_span_last(line);
- char_t* char_ = span_char_last(span);
- double adv = char_->adv * matrix_expansion(span->trm);
- double x = char_->x + adv * cos(rotate);
- double y = char_->y + adv * sin(rotate);
-
- double dx = x - origin.x;
- double dy = y - origin.y;
-
- /* Position relative to origin and before box rotation. */
- double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
- double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
- yy = -yy;
- if (xx > extent.x) extent.x = xx;
- if (yy > extent.y) extent.y = yy;
- if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
- rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span));
- }
- }
- }
- p1 = p;
- rotate = rotate0;
- ctm = ctm0;
- outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
- rotate, p0, p1, extent.x, extent.y);
+ /* Extra vertical space between paragraphs. */
+ if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end;
}
-
- /* Paragraphs p0..p1-1 have same rotation. We output them into
- a single rotated text box. */
-
- /* We need unique id for text box. */
- text_box_id += 1;
-
- if (extract_document_output_rotated_paragraphs(
- alloc,
- page,
- p0,
- p1,
- rotate,
- ctm->e,
- ctm->f,
- extent.x,
- extent.y,
- text_box_id,
- content,
- styles,
- &state
- )) goto end;
- p = p1 - 1;
+
+ if (rotation && rotate != 0)
+ {
+ if (s_odt_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content, styles)) goto end;
+ }
+ else
+ {
+ if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end;
+ }
+ p += 1;
}
- else
+ else if (table)
{
- if (extract_document_to_odt_content_paragraph(alloc, &state, paragraph, content, styles)) goto end;
+ if (s_odt_append_table(alloc, table, content, styles)) goto end;
+ t += 1;
}
-
}
outf("images=%i", images);
@@ -604,7 +705,7 @@ int extract_document_to_odt_content(
outf("page->images_num=%i", page->images_num);
for (i=0; i<page->images_num; ++i)
{
- extract_document_append_image(alloc, content, &page->images[i]);
+ s_odt_append_image(alloc, content, &page->images[i]);
}
}
}
@@ -658,26 +759,39 @@ int extract_odt_content_item(
char* text_intermediate = NULL;
extract_astring_t styles_definitions = {0};
+ /* Insert content before '</office:text>'. */
if (extract_content_insert(
alloc,
text,
NULL /*single*/,
- NULL,
- "</office:text>",
+ NULL /*mid_begin_name*/,
+ "</office:text>" /*mid_end_name*/,
contentss,
contentss_num,
&text_intermediate
)) goto end;
outf("text_intermediate: %s", text_intermediate);
- if (extract_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end;
+ /* Convert <styles> to text. */
+ if (s_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end;
+ /* To make tables work, we seem to need to specify table and column
+ styles, and these can be empty. todo: maybe specify exact sizes based
+ on the pdf table and cell dimensions. */
+ if (extract_astring_cat(alloc, &styles_definitions,
+ "\n"
+ "<style:style style:name=\"extract.table\" style:family=\"table\"/>\n"
+ "<style:style style:name=\"extract.table.column\" style:family=\"table-column\"/>\n"
+ )) goto end;
+
+ /* Replace '<office:automatic-styles/>' with text from
+ <styles_definitions>. */
e = extract_content_insert(
alloc,
text_intermediate,
"<office:automatic-styles/>" /*single*/,
- NULL,
- NULL, //"</office:automatic-styles>",
+ NULL /*mid_begin_name*/,
+ NULL /*mid_end_name*/,
&styles_definitions,
1,
text2
@@ -719,14 +833,14 @@ int extract_odt_content_item(
}
e = 0;
end:
- outf("e=%i errno=%i text2=%s", e, errno, text2);
+ outf("e=%i errno=%i text2=%s", e, errno, text2 ? *text2 : "");
if (e)
{
/* We might have set <text2> to new content. */
extract_free(alloc, text2);
/* We might have used <temp> as a temporary buffer. */
- extract_astring_free(alloc, &temp);
}
+ extract_astring_free(alloc, &temp);
extract_astring_init(&temp);
return e;
}
@@ -747,7 +861,6 @@ int extract_odt_write_template(
int e = -1;
int i;
char* path_tempdir = NULL;
- FILE* f = NULL;
char* path = NULL;
char* text = NULL;
char* text2 = NULL;
@@ -827,7 +940,6 @@ int extract_odt_write_template(
}
/* Copy images into <path_tempdir>/Pictures/. */
- outf("");
extract_free(alloc, &path);
if (extract_asprintf(alloc, &path, "%s/Pictures", path_tempdir) < 0) goto end;
if (extract_mkdir(path, 0777))
@@ -835,7 +947,6 @@ int extract_odt_write_template(
outf("Failed to mkdir %s", path);
goto end;
}
- outf("");
for (i=0; i<images->images_num; ++i)
{
image_t* image = &images->images[i];
@@ -869,8 +980,6 @@ int extract_odt_write_template(
extract_free(alloc, &path);
extract_free(alloc, &text);
extract_free(alloc, &text2);
- //extract_odt_styles_free(alloc, &styles);
- if (f) fclose(f);
if (e)
{
diff --git a/extract/src/outf.c b/extract/src/outf.c
index 95575c16..de7662f6 100644
--- a/extract/src/outf.c
+++ b/extract/src/outf.c
@@ -5,14 +5,14 @@
#include <stdio.h>
#include <string.h>
-static int s_verbose = 0;
+int extract_outf_verbose = 0;
-void outf_verbose_set(int verbose)
+void extract_outf_verbose_set(int verbose)
{
- s_verbose = verbose;
+ extract_outf_verbose = verbose;
}
-void (outf)(
+void (extract_outf)(
int level,
const char* file,
int line,
@@ -23,7 +23,7 @@ void (outf)(
)
{
va_list va;
- if (level > s_verbose) {
+ if (level > extract_outf_verbose) {
return;
}
diff --git a/extract/src/outf.h b/extract/src/outf.h
index a2b6c078..f9b97a93 100644
--- a/extract/src/outf.h
+++ b/extract/src/outf.h
@@ -1,32 +1,42 @@
#ifndef ARTIFEX_EXTRACT_OUTF_H
#define ARTIFEX_EXTRACT_OUTF_H
+/* Simple printf-style debug output. */
+
+#if defined(__GNUC__) || defined(__clang__) || defined(_WIN32)
+ #define extract_FUNCTION __FUNCTION__
+#else
+ #define extract_FUNCTION ""
+#endif
+
+#define outf(format, ...) \
+ (1 > extract_outf_verbose) ? (void) 0 : (extract_outf)(1, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__)
+
+#define outf0(format, ...) \
+ (0 > extract_outf_verbose) ? (void) 0 : (extract_outf)(0, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__)
+
+#define outfx(format, ...)
+
/* Only for internal use by extract code. */
-void (outf)(
+extern int extract_outf_verbose;
+
+void (extract_outf)(
int level,
const char* file, int line,
const char* fn,
int ln,
const char* format,
...
- );
+ )
+ #ifdef __GNUC__
+ __attribute__ ((format (printf, 6, 7)))
+ #endif
+ ;
/* Outputs text if <level> is less than or equal to verbose value set by
extract_outf_verbose_set(). */
-#define outf(format, ...) \
- (outf)(1, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__)
-
-#define outf0(format, ...) \
- (outf)(0, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__)
-
-#define outfx(format, ...)
-
-/* Simple printf-style debug output. */
-
-#define outfx(format, ...)
-
-void outf_verbose_set(int verbose);
+void extract_outf_verbose_set(int verbose);
/* Set verbose value. Higher values are more verbose. Initial value is 0. */
#endif
diff --git a/extract/src/sys.c b/extract/src/sys.c
index 131f6312..2359acab 100644
--- a/extract/src/sys.c
+++ b/extract/src/sys.c
@@ -82,7 +82,7 @@ int extract_read_all_path(extract_alloc_t* alloc, const char* path, char** o_te
e = 0;
end:
if (f) fclose(f);
- if (e) extract_free(alloc, &o_text);
+ if (e) extract_free(alloc, o_text);
return e;
}
diff --git a/extract/src/text.c b/extract/src/text.c
index f832baa2..e75e3e69 100644
--- a/extract/src/text.c
+++ b/extract/src/text.c
@@ -18,23 +18,6 @@ int extract_content_insert(
int contentss_num,
char** o_out
)
-/* Creates a new string by inserting sequence of strings into a template
-string.
-
-If <single_name> is in <original>, it is replaced by <contentss>.
-
-Otherwise the text between the end of <mid_begin_name> and beginning of
-<mid_end_name> is replaced by <contentss>.
-
-If <mid_begin_name> is NULL, we insert into the zero-length region before
-<mid_end_name>.
-
-If <mid_end_name> is NULL, we insert into the zero-length region after
-<mid_begin_name>.
-
-At least one of <single_name>, <mid_begin_name> and <mid_end_name> must be
-non-NULL.
-*/
{
int e = -1;
const char* mid_begin = NULL;
@@ -92,6 +75,11 @@ non-NULL.
if (extract_astring_catl(alloc, &out, contentss[i].chars, contentss[i].chars_num)) goto end;
}
}
+ assert( mid_end);
+ /* As per docs, at least one of <single_name>, <mid_begin_name> and
+ <mid_end_name> is non-null, and this ensures that mid_end must not be null.
+ */
+ /* coverity[var_deref_model] */
if (extract_astring_cat(alloc, &out, mid_end)) goto end;
*o_out = out.chars;
diff --git a/extract/src/xml.c b/extract/src/xml.c
index 8dab511b..24116f6d 100644
--- a/extract/src/xml.c
+++ b/extract/src/xml.c
@@ -349,7 +349,7 @@ int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, co
}
first_line_buffer[actual] = 0;
if (strcmp(first_line, first_line_buffer)) {
- outf("Unrecognised prefix: ", first_line_buffer);
+ outf("Unrecognised prefix: %s", first_line_buffer);
errno = ESRCH;
goto end;
}
@@ -393,7 +393,10 @@ static const char* extract_xml_tag_string(extract_alloc_t* alloc, extract_xml_ta
{
static char* buffer = NULL;
extract_free(alloc, &buffer);
- extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : "");
+ if (extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : ""))
+ {
+ return "";
+ }
return buffer;
}
@@ -410,7 +413,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
assert(buffer);
extract_xml_tag_free(alloc, out);
- /* Read tag name. */
+ /* Read tag name. Initialise it to empty string so we never return
+ out->name==null on success. */
+ if (str_catl( alloc, &out->name, NULL, 0)) goto end;
for( i=0;; ++i) {
int e = extract_buffer_read(buffer, &c, 1, NULL);
if (e) {
@@ -438,6 +443,7 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
int quote_single = 0;
int quote_double = 0;
size_t l;
+ if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
for(;;) {
if (s_next(buffer, &ret, &c)) goto end;
if (c == '\'') quote_single = !quote_single;
@@ -469,6 +475,10 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
}
}
+ /* Ensure name and value are not NULL. */
+ if (str_catl( alloc, &attribute_name, NULL, 0)) goto end;
+ if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
+
if (extract_xml_tag_attributes_append(alloc, out, attribute_name, attribute_value)) goto end;
attribute_name = NULL;
attribute_value = NULL;
diff --git a/extract/src/xml.h b/extract/src/xml.h
index d11fd886..8bc4dae2 100644
--- a/extract/src/xml.h
+++ b/extract/src/xml.h
@@ -35,6 +35,9 @@ void extract_xml_tag_free(extract_alloc_t* alloc, extract_xml_tag_t* tag);
int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, const char* first_line);
/* extract_xml_pparse_*(): simple XML 'pull' parser.
+If <first_line> is not NULL, we require that <buffer> starts with the specified
+text. Usually one would include a final newline in <first_line>.
+
extract_xml_pparse_init() merely consumes the initial '<'. Thereafter
extract_xml_pparse_next() consumes the next '<' before returning the previous
tag. */
@@ -53,6 +56,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out);
Returns 0 with *out containing next tag; or -1 with errno set if error; or +1
with errno=ESRCH if EOF.
+If we return 0, we guarantee that out->name points to valid string and that
+each item in out->attributes has similarly valid name and value members.
+
*out is initially passed to extract_xml_tag_free(), so *out must have been
initialised, e.g. by extract_xml_tag_init(). */
diff --git a/extract/src/zip.c b/extract/src/zip.c
index 03bfd024..691b743b 100644
--- a/extract/src/zip.c
+++ b/extract/src/zip.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <errno.h>
#include <limits.h>
+#include <time.h>
#ifdef _MSC_VER
#include "compat_stdint.h"
@@ -74,8 +75,38 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip)
/* Convert current date/time (UTC) to the ms-dos format required here. If that
fails we use zeros, which doesn't seem to make a difference to Word etc. */
- zip->mtime = 0;
- zip->mdate = 0;
+
+ {
+ time_t t = time(NULL);
+ struct tm* tm;
+ #ifdef _POSIX_SOURCE
+ struct tm tm_local;
+ tm = gmtime_r(&t, &tm_local);
+ #else
+ tm = gmtime(&t);
+ #endif
+ if (tm)
+ {
+ /* mdate and mtime are in MS DOS format:
+ mtime:
+ bits 0-4: seconds / 2.
+ bits 5-10: minute (0-59).
+ bits 11-15: hour (0-23).
+ mdate:
+ bits 0-4: day of month (1-31).
+ bits 5-8: month (1=jan, 2=feb, etc).
+ bits 9-15: year - 1980.
+ */
+ zip->mtime = (uint16_t) ((tm->tm_hour << 11) | (tm->tm_min << 5) | (tm->tm_sec / 2));
+ zip->mdate = (uint16_t) (((1900 + tm->tm_year - 1980) << 9) | ((tm->tm_mon + 1) << 5) | tm->tm_mday);
+ }
+ else
+ {
+ outf0("*** gmtime_r() failed");
+ zip->mtime = 0;
+ zip->mdate = 0;
+ }
+ }
/* These are all copied from command-line zip on unix. */
zip->version_creator = (0x3 << 8) + 30; /* 0x3 is unix, 30 means 3.0. */
@@ -115,7 +146,9 @@ static int s_native_little_endinesss(void)
/* Native big-endiness. */
return 0;
}
- abort();
+ /* Would like to call abort() here, but that breaks on AIX/gcc. */
+ assert(0);
+ return 0;
}
@@ -148,7 +181,7 @@ static int s_write_compressed(
/* Uses zlib to write raw deflate compressed data to zip->buffer. */
{
int ze;
- z_stream zstream;
+ z_stream zstream = {0}; /* Initialise to keep Coverity quiet. */
if (zip->errno_) return -1;
if (zip->eof) return +1;
@@ -313,7 +346,7 @@ int extract_zip_write_file(
cd_file->name = NULL;
cd_file->mtime = zip->mtime;
- cd_file->mdate = zip->mtime;
+ cd_file->mdate = zip->mdate;
cd_file->crc_sum = (int32_t) crc32(crc32(0, NULL, 0), data, (int) data_length);
cd_file->size_uncompressed = (int) data_length;
if (zip->compression_method == 0)