diff options
Diffstat (limited to 'extract/src/extract.c')
-rw-r--r-- | extract/src/extract.c | 891 |
1 files changed, 804 insertions, 87 deletions
diff --git a/extract/src/extract.c b/extract/src/extract.c index 9eb85d2f..2c375571 100644 --- a/extract/src/extract.c +++ b/extract/src/extract.c @@ -5,6 +5,7 @@ #include "document.h" #include "docx.h" #include "docx_template.h" +#include "html.h" #include "mem.h" #include "memento.h" #include "odt.h" @@ -25,7 +26,7 @@ -double matrix_expansion(matrix_t m) +double extract_matrix_expansion(matrix_t m) { return sqrt(fabs(m.a * m.d - m.b * m.c)); } @@ -41,14 +42,31 @@ static void char_init(char_t* item) item->adv = 0; } +const char* extract_point_string(const point_t* point) +{ + static char buffer[128]; + snprintf(buffer, sizeof(buffer), "(%f %f)", point->x, point->y); + return buffer; +} + +const char* extract_rect_string(const rect_t* rect) +{ + static char buffer[2][256]; + static int i = 0; + i = (i + 1) % 2; + snprintf(buffer[i], sizeof(buffer[i]), "((%f %f) (%f %f))", rect->min.x, rect->min.y, rect->max.x, rect->max.y); + return buffer[i]; +} -const char* span_string(extract_alloc_t* alloc, span_t* span) +const char* extract_span_string(extract_alloc_t* alloc, span_t* span) { static extract_astring_t ret = {0}; double x0 = 0; double y0 = 0; + point_t pre0 = {0, 0}; double x1 = 0; double y1 = 0; + point_t pre1 = {0, 0}; int c0 = 0; int c1 = 0; int i; @@ -62,17 +80,23 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) c0 = span->chars[0].ucs; x0 = span->chars[0].x; y0 = span->chars[0].y; + pre0.x = span->chars[0].pre_x; + pre0.y = span->chars[0].pre_y; c1 = span->chars[span->chars_num-1].ucs; x1 = span->chars[span->chars_num-1].x; y1 = span->chars[span->chars_num-1].y; + pre1.x = span->chars[span->chars_num-1].pre_x; + pre1.y = span->chars[span->chars_num-1].pre_y; } { - char buffer[200]; + char buffer[400]; snprintf(buffer, sizeof(buffer), - "span chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f,%f) wmode=%i chars_num=%i: ", + "span ctm=%s trm=%s chars_num=%i (%c:%f,%f pre(%f %f))..(%c:%f,%f pre(%f %f)) font=%s:(%f,%f) wmode=%i chars_num=%i: ", + extract_matrix_string(&span->ctm), + extract_matrix_string(&span->trm), span->chars_num, - c0, x0, y0, - c1, x1, y1, + c0, x0, y0, pre0.x, pre0.y, + c1, x1, y1, pre1.x, pre1.y, span->font_name, span->trm.a, span->trm.d, @@ -84,9 +108,11 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) snprintf( buffer, sizeof(buffer), - " i=%i {x=%f adv=%f}", + " i=%i {x=%f y=%f ucs=%i adv=%f}", i, span->chars[i].x, + span->chars[i].y, + span->chars[i].ucs, span->chars[i].adv ); extract_astring_cat(alloc, &ret, buffer); @@ -101,7 +127,7 @@ const char* span_string(extract_alloc_t* alloc, span_t* span) return ret.chars; } -int span_append_c(extract_alloc_t* alloc, span_t* span, int c) +int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c) { char_t* item; if (extract_realloc2( @@ -119,7 +145,7 @@ int span_append_c(extract_alloc_t* alloc, span_t* span, int c) return 0; } -char_t* span_char_last(span_t* span) +char_t* extract_span_char_last(span_t* span) { assert(span->chars_num > 0); return &span->chars[span->chars_num-1]; @@ -138,58 +164,62 @@ static const char* line_string(line_t* line) int i; for (i=0; i<line->spans_num; ++i) { extract_astring_cat(&ret, " "); - extract_astring_cat(&ret, span_string(line->spans[i])); + extract_astring_cat(&ret, extract_span_string(line->spans[i])); } return ret.chars; } #endif /* Returns first span in a line. */ -span_t* line_span_last(line_t* line) +span_t* extract_line_span_last(line_t* line) { assert(line->spans_num > 0); return line->spans[line->spans_num - 1]; } -span_t* line_span_first(line_t* line) +span_t* extract_line_span_first(line_t* line) { assert(line->spans_num > 0); return line->spans[0]; } -static void page_free(extract_alloc_t* alloc, extract_page_t* page) + +static void table_free(extract_alloc_t* alloc, table_t** ptable) +{ + int c; + table_t* table = *ptable; + outf("table->cells_num_x=%i table->cells_num_y=%i", + table->cells_num_x, + table->cells_num_y + ); + for (c = 0; c< table->cells_num_x * table->cells_num_y; ++c) + { + extract_cell_free(alloc, &table->cells[c]); + } + extract_free(alloc, &table->cells); + extract_free(alloc, ptable); +} + +static void page_free(extract_alloc_t* alloc, extract_page_t** ppage) { - int s; + extract_page_t* page = *ppage; if (!page) return; - for (s=0; s<page->spans_num; ++s) { - span_t* span = page->spans[s]; - if (span) { - extract_free(alloc, &span->chars); - extract_free(alloc, &span->font_name); - } - extract_free(alloc, &span); - } - extract_free(alloc, &page->spans); + outf0("page=%p page->spans_num=%i page->lines_num=%i", + page, page->spans_num, page->lines_num); + extract_spans_free(alloc, &page->spans, page->spans_num); - { - int l; - for (l=0; l<page->lines_num; ++l) { - line_t* line = page->lines[l]; - extract_free(alloc, &line->spans); - extract_free(alloc, &line); - /* We don't free line->spans->chars[] because already freed via - page->spans. */ - } - } - extract_free(alloc, &page->lines); + extract_lines_free(alloc, &page->lines, page->lines_num); { int p; for (p=0; p<page->paragraphs_num; ++p) { paragraph_t* paragraph = page->paragraphs[p]; + /* We don't call extract_lines_free(¶graph->lines) because + these point into the same data as page->lines, which we have + already freed above. */ if (paragraph) extract_free(alloc, ¶graph->lines); - extract_free(alloc, ¶graph); + extract_free(alloc, &page->paragraphs[p]); } } extract_free(alloc, &page->paragraphs); @@ -197,13 +227,26 @@ static void page_free(extract_alloc_t* alloc, extract_page_t* page) { int i; for (i=0; i<page->images_num; ++i) { - extract_free(alloc, &page->images[i].data); - extract_free(alloc, &page->images[i].type); - extract_free(alloc, &page->images[i].id); - extract_free(alloc, &page->images[i].name); + extract_image_clear(alloc, &page->images[i]); } + extract_free(alloc, &page->images); } extract_free(alloc, &page->images); + + extract_free(alloc, &page->tablelines_horizontal.tablelines); + extract_free(alloc, &page->tablelines_vertical.tablelines); + + { + int t; + outf("page=%p page->tables_num=%i", page, page->tables_num); + for (t=0; t<page->tables_num; ++t) + { + table_free(alloc, &page->tables[t]); + } + extract_free(alloc, &page->tables); + } + + extract_free(alloc, ppage); } static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page) @@ -212,9 +255,7 @@ error. */ { span_t* span; if (extract_malloc(alloc, &span, sizeof(*span))) return NULL; - span->font_name = NULL; - span->chars = NULL; - span->chars_num = 0; + extract_span_init(span); if (extract_realloc2( alloc, &page->spans, @@ -234,14 +275,7 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images) { int i; for (i=0; i<images->images_num; ++i) { - image_t* image = &images->images[i]; - extract_free(alloc, &image->type); - extract_free(alloc, &image->name); - extract_free(alloc, &image->id); - if (image->data_free) { - image->data_free(image->data_free_handle, image->data); - } - extract_free(alloc, &images->images[i]); + extract_image_clear(alloc, &images->images[i]); } extract_free(alloc, &images->images); extract_free(alloc, &images->imagetypes); @@ -260,10 +294,12 @@ On return document->page[].images* will be NULL etc. int p; images_t images = {0}; outf("extract_document_images(): images.images_num=%i", images.images_num); - for (p=0; p<document->pages_num; ++p) { + for (p=0; p<document->pages_num; ++p) + { extract_page_t* page = document->pages[p]; int i; - for (i=0; i<page->images_num; ++i) { + for (i=0; i<page->images_num; ++i) + { image_t* image; if (extract_realloc2( alloc, @@ -280,14 +316,17 @@ On return document->page[].images* will be NULL etc. /* Add image type if we haven't seen it before. */ { int it; - for (it=0; it<images.imagetypes_num; ++it) { + for (it=0; it<images.imagetypes_num; ++it) + { outf("it=%i images.imagetypes[it]=%s image->type=%s", it, images.imagetypes[it], image->type); if (!strcmp(images.imagetypes[it], image->type)) { break; } } - if (it == images.imagetypes_num) { + if (it == images.imagetypes_num) + { + /* We haven't seen this image type before. */ if (extract_realloc2( alloc, &images.imagetypes, @@ -314,9 +353,12 @@ On return document->page[].images* will be NULL etc. } e = 0; end: - if (e) { + if (e) + { + extract_free(alloc, &images.images); } - else { + else + { *o_images = images; } return e; @@ -330,8 +372,7 @@ static void extract_document_free(extract_alloc_t* alloc, document_t* document) } for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - page_free(alloc, page); - extract_free(alloc, &page); + page_free(alloc, &page); } extract_free(alloc, &document->pages); document->pages = NULL; @@ -347,7 +388,7 @@ static int s_sign(double x) return 0; } -int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) +int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) { int ret; ret = s_sign(lhs->a - rhs->a); if (ret) return ret; @@ -358,7 +399,7 @@ int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs) } -static point_t multiply_matrix_point(matrix_t m, point_t p) +point_t extract_multiply_matrix_point(matrix_t m, point_t p) { double x = p.x; p.x = m.a * x + m.c * p.y; @@ -366,6 +407,18 @@ static point_t multiply_matrix_point(matrix_t m, point_t p) return p; } +matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2) +{ + matrix_t ret; + ret.a = m1.a * m2.a + m1.b * m2.c; + ret.b = m1.a * m2.b + m1.b * m2.d; + ret.c = m1.c * m2.a + m1.d * m2.c; + ret.d = m1.c * m2.b + m1.d * m2.d; + ret.e = m1.e + m2.e; + ret.f = m1.f + m2.f; + return ret; +} + static int s_matrix_read(const char* text, matrix_t* matrix) { int n; @@ -427,8 +480,8 @@ char_t into a new span_t. */ return 0; } - font_size = matrix_expansion(span->trm) - * matrix_expansion(span->ctm); + font_size = extract_matrix_expansion(span->trm) + * extract_matrix_expansion(span->ctm); if (span->flags.wmode) { dir.x = 0; @@ -438,7 +491,7 @@ char_t into a new span_t. */ dir.x = 1; dir.y = 0; } - dir = multiply_matrix_point(span->trm, dir); + dir = extract_multiply_matrix_point(span->trm, dir); x = char_[-2].pre_x + char_[-2].adv * dir.x; y = char_[-2].pre_y + char_[-2].adv * dir.y; @@ -470,10 +523,10 @@ char_t into a new span_t. */ sometimes seem to appear in the middle of words for some reason. */ outfx("removing space before final char in: %s", - span_string(span)); + extract_span_string(span)); span->chars[span->chars_num-2] = span->chars[span->chars_num-1]; span->chars_num -= 1; - outfx("span is now: %s", span_string(span)); + outfx("span is now: %s", extract_span_string(span)); return 0; } } @@ -536,9 +589,42 @@ struct extract_t int contentss_num; images_t images; - + extract_format_t format; extract_odt_styles_t odt_styles; + + char* tables_csv_format; + int tables_csv_i; + + enum + { + path_type_NONE, + path_type_FILL, + path_type_STROKE, + } path_type; + + union + { + struct + { + matrix_t ctm; + double color; + point_t points[4]; + int n; + } fill; + + struct + { + matrix_t ctm; + double color; + double width; + point_t point0; + int point0_set; + point_t point; + int point_set; + } stroke; + + } path; }; @@ -551,7 +637,12 @@ int extract_begin( int e = -1; extract_t* extract; - if (format != extract_format_ODT && format != extract_format_DOCX) + if (1 + && format != extract_format_ODT + && format != extract_format_DOCX + && format != extract_format_HTML + && format != extract_format_TEXT + ) { outf0("Invalid format=%i\n", format); errno = EINVAL; @@ -570,6 +661,8 @@ int extract_begin( extract->image_n = 10; extract->format = format; + extract->tables_csv_format = NULL; + extract->tables_csv_i = 0; e = 0; @@ -578,6 +671,11 @@ int extract_begin( return e; } +int extract_tables_csv_format(extract_t* extract, const char* path_format) +{ + return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); +} + static void image_free_fn(void* handle, void* image_data) { @@ -872,6 +970,22 @@ int extract_span_begin( span_t* span; assert(extract->document.pages_num > 0); page = extract->document.pages[extract->document.pages_num-1]; + outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i", + ctm_a, + ctm_b, + ctm_c, + ctm_d, + ctm_e, + ctm_f, + trm_a, + trm_b, + trm_c, + trm_d, + trm_e, + trm_f, + font_name, + wmode + ); span = page_span_append(extract->alloc, page); if (!span) goto end; span->ctm.a = ctm_a; @@ -880,12 +994,14 @@ int extract_span_begin( span->ctm.d = ctm_d; span->ctm.e = ctm_e; span->ctm.f = ctm_f; + span->trm.a = trm_a; span->trm.b = trm_b; span->trm.c = trm_c; span->trm.d = trm_d; span->trm.e = trm_e; span->trm.f = trm_f; + { const char* ff = strchr(font_name, '+'); const char* f = (ff) ? ff+1 : font_name; @@ -916,7 +1032,49 @@ int extract_add_char( extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; span_t* span = page->spans[page->spans_num - 1]; - if (autosplit && y - extract->span_offset_y != 0) { + outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); + /* Ignore the specified <autosplit> - there seems no advantage to not + splitting spans on multiple lines, and not doing so causes problems with + missing spaces in the output. */ + autosplit = 1; + + if (span->chars_num) + { + char_t* char_prev = &span->chars[span->chars_num - 1]; + double xx = span->ctm.a * x + span->ctm.c * y + span->ctm.e; + double yy = span->ctm.b * x + span->ctm.d * y + span->ctm.f; + double dx = xx - char_prev->x; + double dy = yy - char_prev->y; + double a = atan2(dy, dx); + double span_a; + matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm); + point_t dir = {1 - span->flags.wmode, span->flags.wmode}; + dir = extract_multiply_matrix_point(m, dir); + span_a = atan2(dir.y, dir.x); + if (fabs(span_a - a) > 0.01) + { + /* Create new span. */ + span_t* span0 = span; + outf("chars_num=%i prev=(%f %f) => (%f %f) xy=(%f %f) => xxyy=(%f %f) delta=(%f %f) a=%f not in line with dir=(%f %f) a=%f: ", + span->chars_num, + char_prev->pre_x, char_prev->pre_y, + char_prev->x, char_prev->y, + x, y, + xx, yy, + dx, dy, a, + dir.x, dir.y, span_a + ); + extract->num_spans_autosplit += 1; + span = page_span_append(extract->alloc, page); + if (!span) goto end; + *span = *span0; + span->chars = NULL; + span->chars_num = 0; + if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end; + } + } + + if (0 && autosplit && y - extract->span_offset_y != 0) { double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x) + span->ctm.b * (y - extract->span_offset_y); @@ -949,21 +1107,20 @@ int extract_add_char( char_pre_y, offset_y); } - if (span_append_c(extract->alloc, span, 0 /*c*/)) goto end; + if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end; + /* Coverity warns, but extract_span_append_c() will have appended an item. */ + /* coverity[var_deref_op] */ char_ = &span->chars[ span->chars_num-1]; - char_->pre_x = x - extract->span_offset_x; - char_->pre_y = y - extract->span_offset_y; + char_->pre_x = x; + char_->pre_y = y; - char_->x = span->ctm.a * char_->pre_x + span->ctm.b * char_->pre_y; - char_->y = span->ctm.c * char_->pre_x + span->ctm.d * char_->pre_y; + char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e; + char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f; char_->adv = adv; char_->ucs = ucs; - char_->x += span->ctm.e; - char_->y += span->ctm.f; - { int page_spans_num_old = page->spans_num; if (page_span_end_clean(extract->alloc, page)) goto end; @@ -1049,6 +1206,174 @@ int extract_add_image( return e; } + +static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, rect_t* rect, double color) +{ + if (extract_realloc( + alloc, + &tablelines->tablelines, + sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1) + )) return -1; + tablelines->tablelines[ tablelines->tablelines_num].rect = *rect; + tablelines->tablelines[ tablelines->tablelines_num].color = (float) color; + tablelines->tablelines_num += 1; + return 0; +} + +static point_t transform(double x, double y, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f + ) +{ + point_t ret; + ret.x = ctm_a * x + ctm_b * y + ctm_e; + ret.y = ctm_c * x + ctm_d * y + ctm_f; + return ret; +} + +static double s_min(double a, double b) +{ + return (a < b) ? a : b; +} + +static double s_max(double a, double b) +{ + return (a > b) ? a : b; +} + +int extract_add_path4( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double x0, + double y0, + double x1, + double y1, + double x2, + double y2, + double x3, + double y3, + double color + ) +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + point_t points[4] = { + transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), + transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f) + }; + rect_t rect; + int i; + double dx; + double dy; + if (0 && color == 1) + { + return 0; + } + outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]", + ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f, + x0, y0, x1, y1, x2, y2, x3, y3 + ); + outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]", + x0, y0, x1, y1, x2, y2, x3, y3); + /* Find first step with dx > 0. */ + for (i=0; i<4; ++i) + { + if (points[(i+1) % 4].x > points[(i+0) % 4].x) break; + } + outf("i=%i", i); + if (i == 4) return 0; + rect.min.x = points[(i+0) % 4].x; + rect.max.x = points[(i+1) % 4].x; + if (points[(i+2) % 4].x != rect.max.x) return 0; + if (points[(i+3) % 4].x != rect.min.x) return 0; + y0 = points[(i+1) % 4].y; + y1 = points[(i+2) % 4].y; + if (y0 == y1) return 0; + if (points[(i+3) % 4].y != y1) return 0; + if (points[(i+4) % 4].y != y0) return 0; + rect.min.y = (y1 > y0) ? y0 : y1; + rect.max.y = (y1 > y0) ? y1 : y0; + + dx = rect.max.x - rect.min.x; + dy = rect.max.y - rect.min.y; + if (dx / dy > 5) + { + /* Horizontal line. */ + outf("have found horizontal line: %s", extract_rect_string(&rect)); + if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1; + } + else if (dy / dx > 5) + { + /* Vertical line. */ + outf("have found vertical line: %s", extract_rect_string(&rect)); + if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1; + } + return 0; +} + + +int extract_add_line( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double width, + double x0, + double y0, + double x1, + double y1, + double color + ) +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); + point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); + double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); + rect_t rect; + (void) color; + rect.min.x = s_min(p0.x, p1.x); + rect.min.y = s_min(p0.y, p1.y); + rect.max.x = s_max(p0.x, p1.x); + rect.max.y = s_max(p0.y, p1.y); + + outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", + extract_FUNCTION, + width, + x0, y0, x1, y1, + extract_rect_string(&rect) + ); + if (rect.min.x == rect.max.x && rect.min.y == rect.max.y) + { + } + else if (rect.min.x == rect.max.x) + { + rect.min.x -= width2 / 2; + rect.max.x += width2 / 2; + return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color); + } + else if (rect.min.y == rect.max.y) + { + rect.min.y -= width2 / 2; + rect.max.y += width2 / 2; + return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color); + } + return 0; +} + + int extract_page_begin(extract_t* extract) { /* Appends new empty extract_page_t to an extract->document. */ @@ -1062,6 +1387,13 @@ int extract_page_begin(extract_t* extract) page->paragraphs_num = 0; page->images = NULL; page->images_num = 0; + page->tablelines_horizontal.tablelines = NULL; + page->tablelines_horizontal.tablelines_num = 0; + page->tablelines_vertical.tablelines = NULL; + page->tablelines_vertical.tablelines_num = 0; + page->tables = NULL; + page->tables_num = 0; + if (extract_realloc2( extract->alloc, &extract->document.pages, @@ -1076,6 +1408,231 @@ int extract_page_begin(extract_t* extract) return 0; } +int extract_fill_begin( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double color + ) +{ + assert(extract->path_type == path_type_NONE); + extract->path_type = path_type_FILL; + extract->path.fill.color = color; + extract->path.fill.n = 0; + extract->path.fill.ctm.a = ctm_a; + extract->path.fill.ctm.b = ctm_b; + extract->path.fill.ctm.c = ctm_c; + extract->path.fill.ctm.d = ctm_d; + extract->path.fill.ctm.e = ctm_e; + extract->path.fill.ctm.f = ctm_f; + return 0; +} + +int extract_stroke_begin( + extract_t* extract, + double ctm_a, + double ctm_b, + double ctm_c, + double ctm_d, + double ctm_e, + double ctm_f, + double line_width, + double color + ) +{ + assert(extract->path_type == path_type_NONE); + extract->path_type = path_type_STROKE; + extract->path.stroke.ctm.a = ctm_a; + extract->path.stroke.ctm.b = ctm_b; + extract->path.stroke.ctm.c = ctm_c; + extract->path.stroke.ctm.d = ctm_d; + extract->path.stroke.ctm.e = ctm_e; + extract->path.stroke.ctm.f = ctm_f; + extract->path.stroke.width = line_width; + extract->path.stroke.color = color; + extract->path.stroke.point0_set = 0; + extract->path.stroke.point_set = 0; + return 0; +} + +int extract_moveto(extract_t* extract, double x, double y) +{ + if (extract->path_type == path_type_FILL) + { + if (extract->path.fill.n == -1) return 0; + if (extract->path.fill.n != 0) + { + outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); + extract->path.fill.n = -1; + return 0; + } + extract->path.fill.points[extract->path.fill.n].x = x; + extract->path.fill.points[extract->path.fill.n].y = y; + extract->path.fill.n += 1; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + extract->path.stroke.point.x = x; + extract->path.stroke.point.y = y; + extract->path.stroke.point_set = 1; + if (!extract->path.stroke.point0_set) + { + extract->path.stroke.point0 = extract->path.stroke.point; + extract->path.stroke.point0_set = 1; + } + return 0; + } + else + { + assert(0); + return -1; + } +} + +int extract_lineto(extract_t* extract, double x, double y) +{ + if (extract->path_type == path_type_FILL) + { + if (extract->path.fill.n == -1) return 0; + if (extract->path.fill.n == 0 || extract->path.fill.n >= 4) + { + outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); + extract->path.fill.n = -1; + return 0; + } + extract->path.fill.points[extract->path.fill.n].x = x; + extract->path.fill.points[extract->path.fill.n].y = y; + extract->path.fill.n += 1; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + if (extract->path.stroke.point_set) + { + if (extract_add_line( + extract, + extract->path.stroke.ctm.a, + extract->path.stroke.ctm.b, + extract->path.stroke.ctm.c, + extract->path.stroke.ctm.d, + extract->path.stroke.ctm.e, + extract->path.stroke.ctm.f, + extract->path.stroke.width, + extract->path.stroke.point.x, + extract->path.stroke.point.y, + x, + y, + extract->path.stroke.color + )) + { + return -1; + } + } + extract->path.stroke.point.x = x; + extract->path.stroke.point.y = y; + extract->path.stroke.point_set = 1; + if (!extract->path.stroke.point0_set) + { + extract->path.stroke.point0 = extract->path.stroke.point; + extract->path.stroke.point0_set = 1; + } + return 0; + } + else + { + assert(0); + return -1; + } +} + +int extract_closepath(extract_t* extract) +{ + if (extract->path_type == path_type_FILL) + { + if (extract->path.fill.n == 4) + { + /* We are closing a four-element path, so this could be a thin + rectangle that defines a line in a table. */ + int e; + e = extract_add_path4( + extract, + extract->path.fill.ctm.a, + extract->path.fill.ctm.b, + extract->path.fill.ctm.c, + extract->path.fill.ctm.d, + extract->path.fill.ctm.e, + extract->path.fill.ctm.f, + extract->path.fill.points[0].x, + extract->path.fill.points[0].y, + extract->path.fill.points[1].x, + extract->path.fill.points[1].y, + extract->path.fill.points[2].x, + extract->path.fill.points[2].y, + extract->path.fill.points[3].x, + extract->path.fill.points[3].y, + extract->path.fill.color + ); + if (e) return e; + } + extract->path.fill.n = 0; + return 0; + } + else if (extract->path_type == path_type_STROKE) + { + if (extract->path.stroke.point0_set && extract->path.stroke.point_set) + { + if (extract_add_line( + extract, + extract->path.stroke.ctm.a, + extract->path.stroke.ctm.b, + extract->path.stroke.ctm.c, + extract->path.stroke.ctm.d, + extract->path.stroke.ctm.e, + extract->path.stroke.ctm.f, + extract->path.stroke.width, + extract->path.stroke.point.x, + extract->path.stroke.point.y, + extract->path.stroke.point0.x, + extract->path.stroke.point0.y, + extract->path.stroke.color + )) + { + return -1; + } + return 0; + } + extract->path.stroke.point = extract->path.stroke.point0; + return 0; + } + else + { + assert(0); + return -1; + } +} + + +int extract_fill_end(extract_t* extract) +{ + assert(extract->path_type == path_type_FILL); + extract->path_type = path_type_NONE; + return 0; +} + + +int extract_stroke_end(extract_t* extract) +{ + assert(extract->path_type == path_type_STROKE); + extract->path_type = path_type_NONE; + return 0; +} + + int extract_page_end(extract_t* extract) { @@ -1083,6 +1640,118 @@ int extract_page_end(extract_t* extract) return 0; } + +static int paragraphs_to_text_content( + extract_alloc_t* alloc, + paragraph_t** paragraphs, + int paragraphs_num, + extract_astring_t* text + ) +{ + int p; + for (p=0; p<paragraphs_num; ++p) + { + paragraph_t* paragraph = paragraphs[p]; + int l; + for (l=0; l<paragraph->lines_num; ++l) + { + line_t* line = paragraph->lines[l]; + int s; + for (s=0; s<line->spans_num; ++s) + { + span_t* span = line->spans[s]; + int c; + for (c=0; c<span->chars_num; ++c) + { + /* We encode each character as utf8. */ + char_t* char_ = &span->chars[c]; + unsigned cc = char_->ucs; + if (extract_astring_catc_unicode( + alloc, + text, + cc, + 0 /*xml*/, + 1 /*ascii_ligatures*/, + 1 /*ascii_dash*/, + 1 /*ascii_apostrophe*/ + )) return -1; + } + } + } + if (extract_astring_catc(alloc, text, '\n')) return -1; + } + return 0; +} + + +static int extract_write_tables_csv(extract_t* extract) +{ + int ret = -1; + int p; + char* path = NULL; + FILE* f = NULL; + extract_astring_t text = {NULL, 0}; + if (!extract->tables_csv_format) return 0; + + outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); + outf("extract->document.pages_num=%i", extract->document.pages_num); + for (p=0; p<extract->document.pages_num; ++p) + { + extract_page_t* page = extract->document.pages[p]; + int t; + outf("p=%i page->tables_num=%i", p, page->tables_num); + for (t=0; t<page->tables_num; ++t) + { + table_t* table = page->tables[t]; + int y; + extract_free(extract->alloc, &path); + if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; + extract->tables_csv_i += 1; + outf("Writing table %i to: %s", t, path); + outf("table->cells_num_x=%i", table->cells_num_x); + outf("table->cells_num_y=%i", table->cells_num_y); + f = fopen(path, "w"); + if (!f) goto end; + for (y=0; y<table->cells_num_y; ++y) + { + int x; + int have_output = 0; + for (x=0; x<table->cells_num_x; ++x) + { + cell_t* cell = table->cells[table->cells_num_x * y + x]; + extract_astring_free(extract->alloc, &text); + if (y==0) + { + outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); + } + if (have_output) fprintf(f, ","); + have_output = 1; + if (paragraphs_to_text_content( + extract->alloc, + cell->paragraphs, + cell->paragraphs_num, + &text + )) goto end; + /* Reference cvs output trims trailing spaces. */ + extract_astring_char_truncate_if(&text, ' '); + fprintf(f, "\"%s\"", text.chars ? text.chars : ""); + } + fprintf(f, "\n"); + } + fclose(f); + f = NULL; + } + } + ret = 0; + + end: + if (f) fclose(f); + extract_free(extract->alloc, &path); + extract_astring_free(extract->alloc, &text); + return ret; +} + + int extract_process( extract_t* extract, int spacing, @@ -1126,6 +1795,30 @@ int extract_process( &extract->contentss[extract->contentss_num - 1] )) goto end; } + else if (extract->format == extract_format_HTML) + { + if (extract_document_to_html_content( + extract->alloc, + &extract->document, + rotation, + images, + &extract->contentss[extract->contentss_num - 1] + )) goto end; + } + else if (extract->format == extract_format_TEXT) + { + int p; + for (p=0; p<extract->document.pages_num; ++p) + { + extract_page_t* page = extract->document.pages[p]; + if (paragraphs_to_text_content( + extract->alloc, + page->paragraphs, + page->paragraphs_num, + &extract->contentss[extract->contentss_num - 1] + )) goto end; + } + } else { outf0("Invalid format=%i", extract->format); @@ -1136,11 +1829,15 @@ int extract_process( if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; + if (extract->tables_csv_format) + { + extract_write_tables_csv(extract); + } + { int i; for (i=0; i<extract->document.pages_num; ++i) { - page_free(extract->alloc, extract->document.pages[i]); - extract_free(extract->alloc, &extract->document.pages[i]); + page_free(extract->alloc, &extract->document.pages[i]); } extract_free(extract->alloc, &extract->document.pages); extract->document.pages_num = 0; @@ -1159,9 +1856,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) char* text2 = NULL; int i; - if (extract_zip_open(buffer, &zip)) goto end; if (extract->format == extract_format_ODT) { + if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<odt_template_items_num; ++i) { const odt_template_item_t* item = &odt_template_items[i]; extract_free(extract->alloc, &text2); @@ -1191,9 +1888,11 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } + if (extract_zip_close(&zip)) goto end; } else if (extract->format == extract_format_DOCX) { + if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<docx_template_items_num; ++i) { const docx_template_item_t* item = &docx_template_items[i]; extract_free(extract->alloc, &text2); @@ -1222,6 +1921,22 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } + if (extract_zip_close(&zip)) goto end; + + } + else if (extract->format == extract_format_HTML) + { + for (i=0; i<extract->contentss_num; ++i) + { + if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; + } + } + else if (extract->format == extract_format_TEXT) + { + for (i=0; i<extract->contentss_num; ++i) + { + if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; + } } else { @@ -1231,15 +1946,15 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) return 1; } - if (extract_zip_close(&zip)) goto end; - assert(!zip); - e = 0; end: - if (e) outf("failed: %s", strerror(errno)); + if (e) + { + outf("failed: %s", strerror(errno)); + extract_zip_close(&zip); + } extract_free(extract->alloc, &text2); - extract_zip_close(&zip); return e; } @@ -1300,6 +2015,7 @@ int extract_write_template( } } + void extract_end(extract_t** pextract) { extract_t* extract = *pextract; @@ -1314,12 +2030,13 @@ void extract_end(extract_t** pextract) extract_free(extract->alloc, &extract->contentss); } extract_images_free(extract->alloc, &extract->images); + extract_odt_styles_free(extract->alloc, &extract->odt_styles); extract_free(extract->alloc, pextract); } void extract_internal_end(void) { - span_string(NULL, NULL); + extract_span_string(NULL, NULL); } void extract_exp_min(extract_t* extract, size_t size) @@ -1329,8 +2046,8 @@ void extract_exp_min(extract_t* extract, size_t size) double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm) { - double font_size = matrix_expansion(*trm) - * matrix_expansion(*ctm); + double font_size = extract_matrix_expansion(*trm) + * extract_matrix_expansion(*ctm); /* Round font_size to nearest 0.01. */ font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; return font_size; |