diff options
Diffstat (limited to 'extract/src/extract.c')
-rw-r--r-- | extract/src/extract.c | 662 |
1 files changed, 393 insertions, 269 deletions
diff --git a/extract/src/extract.c b/extract/src/extract.c index 2c375571..42f888f3 100644 --- a/extract/src/extract.c +++ b/extract/src/extract.c @@ -25,6 +25,9 @@ +const rect_t extract_rect_infinite = { { DBL_MIN, DBL_MIN }, { DBL_MAX, DBL_MAX } }; +const rect_t extract_rect_empty = { { DBL_MAX, DBL_MAX }, { DBL_MIN, DBL_MIN } }; + double extract_matrix_expansion(matrix_t m) { @@ -200,74 +203,97 @@ static void table_free(extract_alloc_t* alloc, table_t** ptable) extract_free(alloc, ptable); } -static void page_free(extract_alloc_t* alloc, extract_page_t** ppage) +void extract_subpage_free(extract_alloc_t* alloc, subpage_t** psubpage) { - extract_page_t* page = *ppage; - if (!page) return; + subpage_t* subpage = *psubpage; + if (!subpage) return; - outf0("page=%p page->spans_num=%i page->lines_num=%i", - page, page->spans_num, page->lines_num); - extract_spans_free(alloc, &page->spans, page->spans_num); + outf0("subpage=%p subpage->spans_num=%i subpage->lines_num=%i", + subpage, subpage->spans_num, subpage->lines_num); + extract_spans_free(alloc, &subpage->spans, subpage->spans_num); - extract_lines_free(alloc, &page->lines, page->lines_num); + extract_lines_free(alloc, &subpage->lines, subpage->lines_num); { int p; - for (p=0; p<page->paragraphs_num; ++p) { - paragraph_t* paragraph = page->paragraphs[p]; + for (p=0; p<subpage->paragraphs_num; ++p) { + paragraph_t* paragraph = subpage->paragraphs[p]; /* We don't call extract_lines_free(¶graph->lines) because - these point into the same data as page->lines, which we have + these point into the same data as subpage->lines, which we have already freed above. */ if (paragraph) extract_free(alloc, ¶graph->lines); - extract_free(alloc, &page->paragraphs[p]); + extract_free(alloc, &subpage->paragraphs[p]); } } - extract_free(alloc, &page->paragraphs); - + extract_free(alloc, &subpage->paragraphs); + { int i; - for (i=0; i<page->images_num; ++i) { - extract_image_clear(alloc, &page->images[i]); + for (i=0; i<subpage->images_num; ++i) { + extract_image_clear(alloc, &subpage->images[i]); } - extract_free(alloc, &page->images); + extract_free(alloc, &subpage->images); } - extract_free(alloc, &page->images); + extract_free(alloc, &subpage->images); + + extract_free(alloc, &subpage->tablelines_horizontal.tablelines); + extract_free(alloc, &subpage->tablelines_vertical.tablelines); - extract_free(alloc, &page->tablelines_horizontal.tablelines); - extract_free(alloc, &page->tablelines_vertical.tablelines); - { int t; - outf("page=%p page->tables_num=%i", page, page->tables_num); - for (t=0; t<page->tables_num; ++t) + outf("subpage=%p subpage->tables_num=%i", subpage, subpage->tables_num); + for (t=0; t<subpage->tables_num; ++t) { - table_free(alloc, &page->tables[t]); + table_free(alloc, &subpage->tables[t]); } - extract_free(alloc, &page->tables); + extract_free(alloc, &subpage->tables); + } + + extract_free(alloc, psubpage); +} + +static void page_free(extract_alloc_t* alloc, extract_page_t** ppage) +{ + int c; + extract_page_t* page = *ppage; + if (!page) return; + + for (c=0; c<page->subpages_num; ++c) + { + subpage_t *subpage = page->subpages[c]; + extract_subpage_free(alloc, &subpage); } - + extract_free(alloc, &page->subpages); extract_free(alloc, ppage); } -static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page) -/* Appends new empty span_ to an extract_page_t; returns NULL with errno set on -error. */ +int subpage_span_append(extract_alloc_t *alloc, subpage_t *subpage, span_t *span) { - span_t* span; - if (extract_malloc(alloc, &span, sizeof(*span))) return NULL; - extract_span_init(span); if (extract_realloc2( alloc, - &page->spans, - sizeof(*page->spans) * page->spans_num, - sizeof(*page->spans) * (page->spans_num + 1) + &subpage->spans, + sizeof(*subpage->spans) * subpage->spans_num, + sizeof(*subpage->spans) * (subpage->spans_num + 1) )) { - extract_free(alloc, &span); - return NULL; + return -1; } - page->spans[page->spans_num] = span; - page->spans_num += 1; - return span; + subpage->spans[subpage->spans_num] = span; + subpage->spans_num += 1; + + return 0; +} + + +static int subpage_span_append_new(extract_alloc_t* alloc, subpage_t *subpage, span_t** pspan) +/* Appends new empty span_ to a subpage_t; returns -1 with errno set on error. */ +{ + if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1; + extract_span_init(*pspan); + if (subpage_span_append(alloc, subpage, *pspan)) { + extract_free(alloc, pspan); + return -1; + } + return 0; } @@ -285,9 +311,9 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images) static int extract_document_images(extract_alloc_t* alloc, document_t* document, images_t* o_images) -/* Moves image_t's from document->page[] to *o_images. +/* Moves image_t's from document->subpage[] to *o_images. -On return document->page[].images* will be NULL etc. +On return document->subpage[].images* will be NULL etc. */ { int e = -1; @@ -297,59 +323,65 @@ On return document->page[].images* will be NULL etc. for (p=0; p<document->pages_num; ++p) { extract_page_t* page = document->pages[p]; - int i; - for (i=0; i<page->images_num; ++i) + int c; + for (c=0; c<page->subpages_num; ++c) { - image_t* image; - if (extract_realloc2( - alloc, - &images.images, - sizeof(image_t) * images.images_num, - sizeof(image_t) * (images.images_num + 1) - )) goto end; - image = &page->images[i]; - outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); - assert(image->name); - images.images[images.images_num] = *image; - images.images_num += 1; - - /* Add image type if we haven't seen it before. */ + subpage_t* subpage = page->subpages[c]; + int i; + for (i=0; i<subpage->images_num; ++i) { - int it; - for (it=0; it<images.imagetypes_num; ++it) + image_t* image; + if (extract_realloc2( + alloc, + &images.images, + sizeof(image_t) * images.images_num, + sizeof(image_t) * (images.images_num + 1) + )) goto end; + image = &subpage->images[i]; + outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); + assert(image->name); + images.images[images.images_num] = *image; + images.images_num += 1; + + /* Add image type if we haven't seen it before. */ { - outf("it=%i images.imagetypes[it]=%s image->type=%s", - it, images.imagetypes[it], image->type); - if (!strcmp(images.imagetypes[it], image->type)) { - break; + int it; + for (it=0; it<images.imagetypes_num; ++it) + { + outf("it=%i images.imagetypes[it]=%s image->type=%s", + it, images.imagetypes[it], image->type); + if (!strcmp(images.imagetypes[it], image->type)) + { + break; + } } - } - if (it == images.imagetypes_num) - { - /* We haven't seen this image type before. */ - if (extract_realloc2( - alloc, - &images.imagetypes, - sizeof(char*) * images.imagetypes_num, - sizeof(char*) * (images.imagetypes_num + 1) + if (it == images.imagetypes_num) + { + /* We haven't seen this image type before. */ + if (extract_realloc2( + alloc, + &images.imagetypes, + sizeof(char*) * images.imagetypes_num, + sizeof(char*) * (images.imagetypes_num + 1) )) goto end; - assert(image->type); - images.imagetypes[images.imagetypes_num] = image->type; - images.imagetypes_num += 1; - outf("have added images.imagetypes_num=%i", images.imagetypes_num); + assert(image->type); + images.imagetypes[images.imagetypes_num] = image->type; + images.imagetypes_num += 1; + outf("have added images.imagetypes_num=%i", images.imagetypes_num); + } } + + /* We've taken ownership of image->* so NULL the original values + here to ensure we can't use things after free. */ + image->type = NULL; + image->name = NULL; + image->id = NULL; + image->data = NULL; + image->data_size = 0; } - - /* We've taken ownership of image->* so NULL the original values - here to ensure we can't use things after free. */ - image->type = NULL; - image->name = NULL; - image->id = NULL; - image->data = NULL; - image->data_size = 0; + extract_free(alloc, &subpage->images); + subpage->images_num = 0; } - extract_free(alloc, &page->images); - page->images_num = 0; } e = 0; end: @@ -367,12 +399,11 @@ On return document->page[].images* will be NULL etc. static void extract_document_free(extract_alloc_t* alloc, document_t* document) { int p; - if (!document) { - return; - } - for (p=0; p<document->pages_num; ++p) { - extract_page_t* page = document->pages[p]; - page_free(alloc, &page); + if (!document) return; + + for (p=0; p<document->pages_num; ++p) + { + page_free(alloc, &document->pages[p]); } extract_free(alloc, &document->pages); document->pages = NULL; @@ -451,11 +482,11 @@ static void s_document_init(document_t* document) } -static int page_span_end_clean(extract_alloc_t* alloc, extract_page_t* page) -/* Does preliminary processing of the end of the last span in a page; intended +static int subpage_span_end_clean(extract_alloc_t* alloc, subpage_t* subpage) +/* Does preliminary processing of the end of the last span in a subpage; intended to be called as we load span information. -Looks at last two char_t's in last span_t of <page>, and either +Looks at last two char_t's in last span_t of <subpage>, and either leaves unchanged, or removes space in last-but-one position, or moves last char_t into a new span_t. */ { @@ -468,9 +499,9 @@ char_t into a new span_t. */ double err_x; double err_y; point_t dir; - - assert(page->spans_num); - span = page->spans[page->spans_num-1]; + + assert(subpage->spans_num); + span = subpage->spans[subpage->spans_num-1]; assert(span->chars_num); /* Last two char_t's are char_[-2] and char_[-1]. */ @@ -547,8 +578,8 @@ char_t into a new span_t. */ span_string2(span) ); { - span_t* span2 = page_span_append(alloc, page); - if (!span2) goto end; + span_t* span2; + if (subpage_span_append_new(alloc, subpage, &span2)) goto end; *span2 = *span; if (extract_strdup(alloc, span->font_name, &span2->font_name)) goto end; span2->chars_num = 1; @@ -567,42 +598,44 @@ char_t into a new span_t. */ struct extract_t { extract_alloc_t* alloc; - + + int layout_analysis; + document_t document; - + int num_spans_split; - /* Number of extra spans from page_span_end_clean(). */ - + /* Number of extra spans from subpage_span_end_clean(). */ + int num_spans_autosplit; /* Number of extra spans from autosplit=1. */ - + double span_offset_x; double span_offset_y; /* Only used if autosplit is non-zero. */ - + int image_n; /* Used to generate unique ids for images. */ - + /* List of strings that are the generated docx content for each page. When zip_* can handle appending of data, we will be able to remove this list. */ extract_astring_t* contentss; int contentss_num; - + images_t images; - + extract_format_t format; extract_odt_styles_t odt_styles; - + char* tables_csv_format; int tables_csv_i; - + enum { path_type_NONE, path_type_FILL, path_type_STROKE, } path_type; - + union { struct @@ -612,7 +645,7 @@ struct extract_t point_t points[4]; int n; } fill; - + struct { matrix_t ctm; @@ -623,7 +656,7 @@ struct extract_t point_t point; int point_set; } stroke; - + } path; }; @@ -636,7 +669,7 @@ int extract_begin( { int e = -1; extract_t* extract; - + if (1 && format != extract_format_ODT && format != extract_format_DOCX @@ -648,29 +681,35 @@ int extract_begin( errno = EINVAL; return -1; } - + /* Use a temporary extract_alloc_t to allocate space for the extract_t. */ if (extract_malloc(alloc, &extract, sizeof(*extract))) goto end; - + extract_bzero(extract, sizeof(*extract)); extract->alloc = alloc; s_document_init(&extract->document); - + /* Start at 10 because template document might use some low-numbered IDs. */ extract->image_n = 10; - + extract->format = format; extract->tables_csv_format = NULL; extract->tables_csv_i = 0; - + e = 0; - + end: *pextract = (e) ? NULL : extract; return e; } +int extract_set_layout_analysis(extract_t *extract, int enable) +{ + extract->layout_analysis = enable; + return 0; +} + int extract_tables_csv_format(extract_t* extract, const char* path_format) { return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); @@ -686,7 +725,7 @@ static void image_free_fn(void* handle, void* image_data) int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int autosplit) { int ret = -1; - + document_t* document = &extract->document; char* image_data = NULL; int num_spans = 0; @@ -716,16 +755,18 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int </page> ... - We convert this into a list of extract_page_t's, each containing a list of + We convert this into a list of subpage_t's, each containing a list of span_t's, each containing a list of char_t's. While doing this, we do some within-span processing by calling - page_span_end_clean(): + subpage_span_end_clean(): Remove spurious spaces. Split spans in two where there seem to be large gaps between glyphs. */ for(;;) { extract_page_t* page; + subpage_t* subpage; + rect_t mediabox = extract_rect_infinite; /* Fake mediabox */ int e = extract_xml_pparse_next(buffer, &tag); if (e == 1) break; /* EOF. */ if (e) goto end; @@ -741,14 +782,16 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int goto end; } outfx("loading spans for page %i...", document->pages_num); - if (extract_page_begin(extract)) goto end; + if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end; page = extract->document.pages[extract->document.pages_num-1]; if (!page) goto end; + subpage = page->subpages[page->subpages_num-1]; + if (!subpage) goto end; for(;;) { if (extract_xml_pparse_next(buffer, &tag)) goto end; if (!strcmp(tag.name, "/page")) { - num_spans += page->spans_num; + num_spans += subpage->spans_num; break; } if (!strcmp(tag.name, "image")) { @@ -804,20 +847,20 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; byte *= 16; - + cc = *c; c += 1; if (cc >= '0' && cc <= '9') byte += cc-'0'; else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; - + image_data[i] = (char) byte; i += 1; if (i == image_data_size) { break; } continue; - + compressed_error: outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars)); errno = EINVAL; @@ -893,12 +936,12 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int trm.e, trm.f )) goto end; - + for(;;) { - double x; - double y; - double adv; - unsigned ucs; + double x; + double y; + double adv; + unsigned ucs; if (extract_xml_pparse_next(buffer, &tag)) { outf("Failed to find <char or </span"); @@ -917,16 +960,17 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end; if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end; if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end; - - if (extract_add_char(extract, x, y, ucs, adv, autosplit)) goto end; + + /* BBox is bogus here. Analysis will fail. */ + if (extract_add_char(extract, x, y, ucs, adv, autosplit, x, y, x + adv, y + adv)) goto end; } extract_xml_tag_free(extract->alloc, &tag); } } if (extract_page_end(extract)) goto end; - outf("page=%i page->num_spans=%i", - document->pages_num, page->spans_num); + outf("page=%i subpage->num_spans=%i", + document->pages_num, subpage->spans_num); } outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i", @@ -940,7 +984,7 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int end: extract_xml_tag_free(extract->alloc, &tag); extract_free(extract->alloc, &image_data); - + return ret; } @@ -967,9 +1011,11 @@ int extract_span_begin( { int e = -1; extract_page_t* page; + subpage_t* subpage; span_t* span; assert(extract->document.pages_num > 0); page = extract->document.pages[extract->document.pages_num-1]; + subpage = page->subpages[page->subpages_num-1]; outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i", ctm_a, ctm_b, @@ -986,22 +1032,21 @@ int extract_span_begin( font_name, wmode ); - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; span->ctm.a = ctm_a; span->ctm.b = ctm_b; span->ctm.c = ctm_c; span->ctm.d = ctm_d; span->ctm.e = ctm_e; span->ctm.f = ctm_f; - + span->trm.a = trm_a; span->trm.b = trm_b; span->trm.c = trm_c; span->trm.d = trm_d; span->trm.e = trm_e; span->trm.f = trm_f; - + { const char* ff = strchr(font_name, '+'); const char* f = (ff) ? ff+1 : font_name; @@ -1019,25 +1064,30 @@ int extract_span_begin( int extract_add_char( - extract_t* extract, - double x, - double y, - unsigned ucs, - double adv, - int autosplit + extract_t* extract, + double x, + double y, + unsigned ucs, + double adv, + int autosplit, + double x0, + double y0, + double x1, + double y1 ) { int e = -1; char_t* char_; extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; - span_t* span = page->spans[page->spans_num - 1]; - + subpage_t* subpage = page->subpages[page->subpages_num-1]; + span_t* span = subpage->spans[subpage->spans_num - 1]; + outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); /* Ignore the specified <autosplit> - there seems no advantage to not splitting spans on multiple lines, and not doing so causes problems with missing spaces in the output. */ autosplit = 1; - + if (span->chars_num) { char_t* char_prev = &span->chars[span->chars_num - 1]; @@ -1065,17 +1115,16 @@ int extract_add_char( dir.x, dir.y, span_a ); extract->num_spans_autosplit += 1; - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; *span = *span0; span->chars = NULL; span->chars_num = 0; if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end; } } - + if (0 && autosplit && y - extract->span_offset_y != 0) { - + double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x) + span->ctm.b * (y - extract->span_offset_y); double f = span->ctm.f + span->ctm.c * (x - extract->span_offset_x) @@ -1094,8 +1143,7 @@ int extract_add_char( /* Create new span. */ span_t* span0 = span; extract->num_spans_autosplit += 1; - span = page_span_append(extract->alloc, page); - if (!span) goto end; + if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end; *span = *span0; span->chars = NULL; span->chars_num = 0; @@ -1106,31 +1154,35 @@ int extract_add_char( outfx("autosplit: char_pre_y=%f offset_y=%f", char_pre_y, offset_y); } - + if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end; /* Coverity warns, but extract_span_append_c() will have appended an item. */ /* coverity[var_deref_op] */ char_ = &span->chars[ span->chars_num-1]; - + char_->pre_x = x; char_->pre_y = y; char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e; char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f; - + char_->adv = adv; char_->ucs = ucs; + char_->bbox.min.x = x0; + char_->bbox.min.y = y0; + char_->bbox.max.x = x1; + char_->bbox.max.y = y1; { - int page_spans_num_old = page->spans_num; - if (page_span_end_clean(extract->alloc, page)) goto end; - span = page->spans[page->spans_num-1]; /* fixme: unnecessary. */ - if (page->spans_num != page_spans_num_old) { + int subpage_spans_num_old = subpage->spans_num; + if (subpage_span_end_clean(extract->alloc, subpage)) goto end; + span = subpage->spans[subpage->spans_num-1]; /* fixme: unnecessary. */ + if (subpage->spans_num != subpage_spans_num_old) { extract->num_spans_split += 1; } } e = 0; - + end: return e; } @@ -1139,13 +1191,14 @@ int extract_add_char( int extract_span_end(extract_t* extract) { extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; - span_t* span = page->spans[page->spans_num - 1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; + span_t* span = subpage->spans[subpage->spans_num - 1]; if (span->chars_num == 0) { /* Calling code called extract_span_begin() then extract_span_end() without any call to extract_add_char(). Our joining code assumes that all spans are non-empty, so we need to delete this span. */ - extract_free(extract->alloc, &page->spans[page->spans_num - 1]); - page->spans_num -= 1; + extract_free(extract->alloc, &subpage->spans[subpage->spans_num - 1]); + subpage->spans_num -= 1; } return 0; } @@ -1166,8 +1219,9 @@ int extract_add_image( { int e = -1; extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; image_t image_temp = {0}; - + extract->image_n += 1; image_temp.x = x; image_temp.y = y; @@ -1180,29 +1234,29 @@ int extract_add_image( if (extract_strdup(extract->alloc, type, &image_temp.type)) goto end; if (extract_asprintf(extract->alloc, &image_temp.id, "rId%i", extract->image_n) < 0) goto end; if (extract_asprintf(extract->alloc, &image_temp.name, "image%i.%s", extract->image_n, image_temp.type) < 0) goto end; - + if (extract_realloc2( extract->alloc, - &page->images, - sizeof(image_t) * page->images_num, - sizeof(image_t) * (page->images_num + 1) + &subpage->images, + sizeof(image_t) * subpage->images_num, + sizeof(image_t) * (subpage->images_num + 1) )) goto end; - - page->images[page->images_num] = image_temp; - page->images_num += 1; - outf("page->images_num=%i", page->images_num); - + + subpage->images[subpage->images_num] = image_temp; + subpage->images_num += 1; + outf("subpage->images_num=%i", subpage->images_num); + e = 0; - + end: - + if (e) { extract_free(extract->alloc, &image_temp.type); extract_free(extract->alloc, &image_temp.data); extract_free(extract->alloc, &image_temp.id); extract_free(extract->alloc, &image_temp.name); } - + return e; } @@ -1220,7 +1274,7 @@ static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, r return 0; } -static point_t transform(double x, double y, +static point_t transform(double x, double y, double ctm_a, double ctm_b, double ctm_c, @@ -1265,6 +1319,7 @@ int extract_add_path4( ) { extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; point_t points[4] = { transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), @@ -1303,20 +1358,20 @@ int extract_add_path4( if (points[(i+4) % 4].y != y0) return 0; rect.min.y = (y1 > y0) ? y0 : y1; rect.max.y = (y1 > y0) ? y1 : y0; - + dx = rect.max.x - rect.min.x; dy = rect.max.y - rect.min.y; if (dx / dy > 5) { /* Horizontal line. */ outf("have found horizontal line: %s", extract_rect_string(&rect)); - if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1; + if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1; } else if (dy / dx > 5) { /* Vertical line. */ outf("have found vertical line: %s", extract_rect_string(&rect)); - if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1; + if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1; } return 0; } @@ -1339,6 +1394,7 @@ int extract_add_line( ) { extract_page_t* page = extract->document.pages[extract->document.pages_num-1]; + subpage_t* subpage = page->subpages[page->subpages_num-1]; point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); @@ -1348,7 +1404,7 @@ int extract_add_line( rect.min.y = s_min(p0.y, p1.y); rect.max.x = s_max(p0.x, p1.x); rect.max.y = s_max(p0.y, p1.y); - + outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", extract_FUNCTION, width, @@ -1362,49 +1418,98 @@ int extract_add_line( { rect.min.x -= width2 / 2; rect.max.x += width2 / 2; - return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color); + return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color); } else if (rect.min.y == rect.max.y) { rect.min.y -= width2 / 2; rect.max.y += width2 / 2; - return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color); + return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color); } return 0; } +int extract_subpage_alloc(extract_alloc_t* alloc, rect_t mediabox, extract_page_t* page, subpage_t** psubpage) +{ + subpage_t* subpage; + if (extract_malloc(alloc, psubpage, sizeof(subpage_t))) + { + return -1; + } + subpage = *psubpage; + subpage->mediabox = mediabox; + subpage->spans = NULL; + subpage->spans_num = 0; + subpage->lines = NULL; + subpage->lines_num = 0; + subpage->paragraphs = NULL; + subpage->paragraphs_num = 0; + subpage->images = NULL; + subpage->images_num = 0; + subpage->tablelines_horizontal.tablelines = NULL; + subpage->tablelines_horizontal.tablelines_num = 0; + subpage->tablelines_vertical.tablelines = NULL; + subpage->tablelines_vertical.tablelines_num = 0; + subpage->tables = NULL; + subpage->tables_num = 0; + + if (extract_realloc2( + alloc, + &page->subpages, + sizeof(subpage_t*) * page->subpages_num, + sizeof(subpage_t*) * (page->subpages_num + 1) + )) { + extract_free(alloc, psubpage); + return -1; + } + page->subpages[page->subpages_num] = subpage; + page->subpages_num += 1; + return 0; +} + +static int extract_subpage_begin(extract_t* extract, double x0, double y0, double x1, double y1) +/* Appends new empty subpage_t to the last page of an extract->document. */ +{ + extract_page_t* page = extract->document.pages[extract->document.pages_num - 1]; + subpage_t* subpage; + rect_t mediabox = { { x0, y0 }, { x1, y1 } }; + + return extract_subpage_alloc(extract->alloc, mediabox, page, &subpage); +} -int extract_page_begin(extract_t* extract) +int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1) { - /* Appends new empty extract_page_t to an extract->document. */ + /* Appends new empty page_t to an extract->document. */ extract_page_t* page; - if (extract_malloc(extract->alloc, &page, sizeof(extract_page_t))) return -1; - page->spans = NULL; - page->spans_num = 0; - page->lines = NULL; - page->lines_num = 0; - page->paragraphs = NULL; - page->paragraphs_num = 0; - page->images = NULL; - page->images_num = 0; - page->tablelines_horizontal.tablelines = NULL; - page->tablelines_horizontal.tablelines_num = 0; - page->tablelines_vertical.tablelines = NULL; - page->tablelines_vertical.tablelines_num = 0; - page->tables = NULL; - page->tables_num = 0; - + + if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1; + page->mediabox.min.x = x0; + page->mediabox.min.y = y0; + page->mediabox.max.x = x1; + page->mediabox.max.y = y1; + page->subpages = NULL; + page->subpages_num = 0; + page->split = NULL; + if (extract_realloc2( extract->alloc, &extract->document.pages, - sizeof(extract_page_t*) * extract->document.pages_num + 1, - sizeof(extract_page_t*) * (extract->document.pages_num + 1) + sizeof(subpage_t*) * extract->document.pages_num, + sizeof(subpage_t*) * (extract->document.pages_num + 1) )) { extract_free(extract->alloc, &page); return -1; } + extract->document.pages[extract->document.pages_num] = page; extract->document.pages_num += 1; + + if (extract_subpage_begin(extract, x0, y0, x1, y1)) { + extract->document.pages_num--; + page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]); + return -1; + } + return 0; } @@ -1634,13 +1739,22 @@ int extract_stroke_end(extract_t* extract) -int extract_page_end(extract_t* extract) +static int extract_subpage_end(extract_t* extract) { (void) extract; return 0; } +int extract_page_end(extract_t* extract) +{ + if (extract_subpage_end(extract)) + return -1; + + return 0; +} + + static int paragraphs_to_text_content( extract_alloc_t* alloc, paragraph_t** paragraphs, @@ -1692,54 +1806,59 @@ static int extract_write_tables_csv(extract_t* extract) FILE* f = NULL; extract_astring_t text = {NULL, 0}; if (!extract->tables_csv_format) return 0; - + outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); outf("extract->document.pages_num=%i", extract->document.pages_num); for (p=0; p<extract->document.pages_num; ++p) { + int c; extract_page_t* page = extract->document.pages[p]; - int t; - outf("p=%i page->tables_num=%i", p, page->tables_num); - for (t=0; t<page->tables_num; ++t) + for (c=0; c<page->subpages_num; ++c) { - table_t* table = page->tables[t]; - int y; - extract_free(extract->alloc, &path); - if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; - extract->tables_csv_i += 1; - outf("Writing table %i to: %s", t, path); - outf("table->cells_num_x=%i", table->cells_num_x); - outf("table->cells_num_y=%i", table->cells_num_y); - f = fopen(path, "w"); - if (!f) goto end; - for (y=0; y<table->cells_num_y; ++y) + subpage_t* subpage = page->subpages[c]; + int t; + outf("p=%i subpage->tables_num=%i", p, subpage->tables_num); + for (t=0; t<subpage->tables_num; ++t) { - int x; - int have_output = 0; - for (x=0; x<table->cells_num_x; ++x) + table_t* table = subpage->tables[t]; + int y; + extract_free(extract->alloc, &path); + if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; + extract->tables_csv_i += 1; + outf("Writing table %i to: %s", t, path); + outf("table->cells_num_x=%i", table->cells_num_x); + outf("table->cells_num_y=%i", table->cells_num_y); + f = fopen(path, "w"); + if (!f) goto end; + for (y=0; y<table->cells_num_y; ++y) { - cell_t* cell = table->cells[table->cells_num_x * y + x]; - extract_astring_free(extract->alloc, &text); - if (y==0) + int x; + int have_output = 0; + for (x=0; x<table->cells_num_x; ++x) { - outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); + cell_t* cell = table->cells[table->cells_num_x * y + x]; + extract_astring_free(extract->alloc, &text); + if (y==0) + { + outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); + } + if (have_output) fprintf(f, ","); + have_output = 1; + if (paragraphs_to_text_content( + extract->alloc, + cell->paragraphs, + cell->paragraphs_num, + &text + )) goto end; + /* Reference cvs output trims trailing spaces. */ + extract_astring_char_truncate_if(&text, ' '); + fprintf(f, "\"%s\"", text.chars ? text.chars : ""); } - if (have_output) fprintf(f, ","); - have_output = 1; - if (paragraphs_to_text_content( - extract->alloc, - cell->paragraphs, - cell->paragraphs_num, - &text - )) goto end; - /* Reference cvs output trims trailing spaces. */ - extract_astring_char_truncate_if(&text, ' '); - fprintf(f, "\"%s\"", text.chars ? text.chars : ""); + fprintf(f, "\n"); } - fprintf(f, "\n"); + fclose(f); + f = NULL; } - fclose(f); - f = NULL; } } ret = 0; @@ -1760,7 +1879,7 @@ int extract_process( ) { int e = -1; - + if (extract_realloc2( extract->alloc, &extract->contentss, @@ -1769,9 +1888,9 @@ int extract_process( )) goto end; extract_astring_init(&extract->contentss[extract->contentss_num]); extract->contentss_num += 1; - - if (extract_document_join(extract->alloc, &extract->document)) goto end; - + + if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis)) goto end; + if (extract->format == extract_format_ODT) { if (extract_document_to_odt_content( @@ -1811,12 +1930,17 @@ int extract_process( for (p=0; p<extract->document.pages_num; ++p) { extract_page_t* page = extract->document.pages[p]; - if (paragraphs_to_text_content( - extract->alloc, - page->paragraphs, - page->paragraphs_num, - &extract->contentss[extract->contentss_num - 1] + int c; + for (c=0; c<page->subpages_num; ++c) + { + subpage_t* subpage = page->subpages[c]; + if (paragraphs_to_text_content( + extract->alloc, + subpage->paragraphs, + subpage->paragraphs_num, + &extract->contentss[extract->contentss_num - 1] )) goto end; + } } } else @@ -1828,23 +1952,23 @@ int extract_process( } if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; - + if (extract->tables_csv_format) { extract_write_tables_csv(extract); } - + { - int i; - for (i=0; i<extract->document.pages_num; ++i) { - page_free(extract->alloc, &extract->document.pages[i]); + int p; + for (p=0; p<extract->document.pages_num; ++p) { + page_free(extract->alloc, &extract->document.pages[p]); } extract_free(extract->alloc, &extract->document.pages); extract->document.pages_num = 0; } - + e = 0; - + end: return e; } @@ -1855,7 +1979,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) extract_zip_t* zip = NULL; char* text2 = NULL; int i; - + if (extract->format == extract_format_ODT) { if (extract_zip_open(buffer, &zip)) goto end; @@ -1922,7 +2046,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } if (extract_zip_close(&zip)) goto end; - + } else if (extract->format == extract_format_HTML) { @@ -1945,9 +2069,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) errno = EINVAL; return 1; } - + e = 0; - + end: if (e) { @@ -1955,7 +2079,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer) extract_zip_close(&zip); } extract_free(extract->alloc, &text2); - + return e; } @@ -1982,7 +2106,7 @@ static int string_ends_with(const char* string, const char* end) } int extract_write_template( - extract_t* extract, + extract_t* extract, const char* path_template, const char* path_out, int preserve_dir @@ -2021,7 +2145,7 @@ void extract_end(extract_t** pextract) extract_t* extract = *pextract; if (!extract) return; extract_document_free(extract->alloc, &extract->document); - + { int i; for (i=0; i<extract->contentss_num; ++i) { |