1 files changed, 393 insertions, 269 deletions
diff --git a/extract/src/extract.c b/extract/src/extract.c
index 2c375571..42f888f3 100644
--- a/extract/src/extract.c
+++ b/extract/src/extract.c
@@ -25,6 +25,9 @@
 
 
 
+const rect_t extract_rect_infinite = { { -DBL_MAX, -DBL_MAX }, { DBL_MAX, DBL_MAX } }; /* min = lowest finite double (-DBL_MAX, not DBL_MIN which is a tiny positive value), max = highest: spans every finite coordinate. */
+const rect_t extract_rect_empty    = { { DBL_MAX, DBL_MAX }, { -DBL_MAX, -DBL_MAX } }; /* Inverted bounds (min > max on both axes), so no point lies inside. */
+
 
 double extract_matrix_expansion(matrix_t m)
 {
@@ -200,74 +203,97 @@ static void table_free(extract_alloc_t* alloc, table_t** ptable)
     extract_free(alloc, ptable);
 }
 
-static void page_free(extract_alloc_t* alloc, extract_page_t** ppage)
+void extract_subpage_free(extract_alloc_t* alloc, subpage_t** psubpage)
 {
-    extract_page_t* page = *ppage;
-    if (!page) return;
+    subpage_t* subpage = *psubpage;
+    if (!subpage) return;
 
-    outf0("page=%p page->spans_num=%i page->lines_num=%i",
-            page, page->spans_num, page->lines_num);
-    extract_spans_free(alloc, &page->spans, page->spans_num);
+    outf0("subpage=%p subpage->spans_num=%i subpage->lines_num=%i",
+            subpage, subpage->spans_num, subpage->lines_num);
+    extract_spans_free(alloc, &subpage->spans, subpage->spans_num);
 
-    extract_lines_free(alloc, &page->lines, page->lines_num);
+    extract_lines_free(alloc, &subpage->lines, subpage->lines_num);
 
     {
         int p;
-        for (p=0; p<page->paragraphs_num; ++p) {
-            paragraph_t* paragraph = page->paragraphs[p];
+        for (p=0; p<subpage->paragraphs_num; ++p) {
+            paragraph_t* paragraph = subpage->paragraphs[p];
             /* We don't call extract_lines_free(&paragraph->lines) because
-            these point into the same data as page->lines, which we have
+            these point into the same data as subpage->lines, which we have
             already freed above. */
             if (paragraph) extract_free(alloc, &paragraph->lines);
-            extract_free(alloc, &page->paragraphs[p]);
+            extract_free(alloc, &subpage->paragraphs[p]);
         }
     }
-    extract_free(alloc, &page->paragraphs);
-    
+    extract_free(alloc, &subpage->paragraphs);
+
     {
         int i;
-        for (i=0; i<page->images_num; ++i) {
-            extract_image_clear(alloc, &page->images[i]);
+        for (i=0; i<subpage->images_num; ++i) {
+            extract_image_clear(alloc, &subpage->images[i]);
         }
-        extract_free(alloc, &page->images);
+        extract_free(alloc, &subpage->images);
     }
-    extract_free(alloc, &page->images);
+    extract_free(alloc, &subpage->images);
+
+    extract_free(alloc, &subpage->tablelines_horizontal.tablelines);
+    extract_free(alloc, &subpage->tablelines_vertical.tablelines);
 
-    extract_free(alloc, &page->tablelines_horizontal.tablelines);
-    extract_free(alloc, &page->tablelines_vertical.tablelines);
-    
     {
         int t;
-        outf("page=%p page->tables_num=%i", page, page->tables_num);
-        for (t=0; t<page->tables_num; ++t)
+        outf("subpage=%p subpage->tables_num=%i", subpage, subpage->tables_num);
+        for (t=0; t<subpage->tables_num; ++t)
         {
-            table_free(alloc, &page->tables[t]);
+            table_free(alloc, &subpage->tables[t]);
         }
-        extract_free(alloc, &page->tables);
+        extract_free(alloc, &subpage->tables);
+    }
+
+    extract_free(alloc, psubpage);
+}
+
+static void page_free(extract_alloc_t* alloc, extract_page_t** ppage)
+{
+    int c;
+    extract_page_t* page = *ppage;
+    if (!page) return;
+
+    for (c=0; c<page->subpages_num; ++c)
+    {
+        subpage_t *subpage = page->subpages[c];
+        extract_subpage_free(alloc, &subpage);
     }
-    
+    extract_free(alloc, &page->subpages);
     extract_free(alloc, ppage);
 }
 
-static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page)
-/* Appends new empty span_ to an extract_page_t; returns NULL with errno set on
-error. */
+int subpage_span_append(extract_alloc_t *alloc, subpage_t *subpage, span_t *span)
 {
-    span_t* span;
-    if (extract_malloc(alloc, &span, sizeof(*span))) return NULL;
-    extract_span_init(span);
     if (extract_realloc2(
             alloc,
-            &page->spans,
-            sizeof(*page->spans) * page->spans_num,
-            sizeof(*page->spans) * (page->spans_num + 1)
+            &subpage->spans,
+            sizeof(*subpage->spans) * subpage->spans_num,
+            sizeof(*subpage->spans) * (subpage->spans_num + 1)
             )) {
-        extract_free(alloc, &span);
-        return NULL;
+        return -1;
     }
-    page->spans[page->spans_num] = span;
-    page->spans_num += 1;
-    return span;
+    subpage->spans[subpage->spans_num] = span;
+    subpage->spans_num += 1;
+
+    return 0;
+}
+
+
+static int subpage_span_append_new(extract_alloc_t* alloc, subpage_t *subpage, span_t** pspan)
+/* Allocates a new empty span_t, appends it to <subpage> and returns it in *pspan; returns -1 with errno set on error. */
+{
+    if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1;
+    extract_span_init(*pspan);
+    if (subpage_span_append(alloc, subpage, *pspan)) {
+        extract_free(alloc, pspan);
+        return -1;
+    }
+    return 0;
 }
 
 
@@ -285,9 +311,9 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images)
 
 
 static int extract_document_images(extract_alloc_t* alloc, document_t* document, images_t* o_images)
-/* Moves image_t's from document->page[] to *o_images.
+/* Moves image_t's from document->pages[]->subpages[] to *o_images.
 
-On return document->page[].images* will be NULL etc.
+On return document->pages[]->subpages[].images* will be NULL etc.
 */
 {
     int e = -1;
@@ -297,59 +323,65 @@ On return document->page[].images* will be NULL etc.
     for (p=0; p<document->pages_num; ++p)
     {
         extract_page_t* page = document->pages[p];
-        int i;
-        for (i=0; i<page->images_num; ++i)
+        int c;
+        for (c=0; c<page->subpages_num; ++c)
         {
-            image_t* image;
-            if (extract_realloc2(
-                    alloc,
-                    &images.images,
-                    sizeof(image_t) * images.images_num,
-                    sizeof(image_t) * (images.images_num + 1)
-                    )) goto end;
-            image = &page->images[i];
-            outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id);
-            assert(image->name);
-            images.images[images.images_num] = *image;
-            images.images_num += 1;
-            
-            /* Add image type if we haven't seen it before. */
+            subpage_t* subpage = page->subpages[c];
+            int i;
+            for (i=0; i<subpage->images_num; ++i)
             {
-                int it;
-                for (it=0; it<images.imagetypes_num; ++it)
+                image_t* image;
+                if (extract_realloc2(
+                        alloc,
+                        &images.images,
+                        sizeof(image_t) * images.images_num,
+                        sizeof(image_t) * (images.images_num + 1)
+                    )) goto end;
+                image = &subpage->images[i];
+                outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id);
+                assert(image->name);
+                images.images[images.images_num] = *image;
+                images.images_num += 1;
+
+                /* Add image type if we haven't seen it before. */
                 {
-                    outf("it=%i images.imagetypes[it]=%s image->type=%s",
-                            it, images.imagetypes[it], image->type);
-                    if (!strcmp(images.imagetypes[it], image->type)) {
-                        break;
+                    int it;
+                    for (it=0; it<images.imagetypes_num; ++it)
+                    {
+                        outf("it=%i images.imagetypes[it]=%s image->type=%s",
+                                it, images.imagetypes[it], image->type);
+                        if (!strcmp(images.imagetypes[it], image->type))
+                        {
+                            break;
+                        }
                     }
-                }
-                if (it == images.imagetypes_num)
-                {
-                    /* We haven't seen this image type before. */
-                    if (extract_realloc2(
-                            alloc,
-                            &images.imagetypes,
-                            sizeof(char*) * images.imagetypes_num,
-                            sizeof(char*) * (images.imagetypes_num + 1)
+                    if (it == images.imagetypes_num)
+                    {
+                        /* We haven't seen this image type before. */
+                        if (extract_realloc2(
+                                alloc,
+                                &images.imagetypes,
+                                sizeof(char*) * images.imagetypes_num,
+                                sizeof(char*) * (images.imagetypes_num + 1)
                             )) goto end;
-                    assert(image->type);
-                    images.imagetypes[images.imagetypes_num] = image->type;
-                    images.imagetypes_num += 1;
-                    outf("have added images.imagetypes_num=%i", images.imagetypes_num);
+                        assert(image->type);
+                        images.imagetypes[images.imagetypes_num] = image->type;
+                        images.imagetypes_num += 1;
+                        outf("have added images.imagetypes_num=%i", images.imagetypes_num);
+                    }
                 }
+
+                /* We've taken ownership of image->* so NULL the original values
+                here to ensure we can't use things after free. */
+                image->type = NULL;
+                image->name = NULL;
+                image->id = NULL;
+                image->data = NULL;
+                image->data_size = 0;
             }
-            
-            /* We've taken ownership of image->* so NULL the original values
-            here to ensure we can't use things after free. */
-            image->type = NULL;
-            image->name = NULL;
-            image->id = NULL;
-            image->data = NULL;
-            image->data_size = 0;
+            extract_free(alloc, &subpage->images);
+            subpage->images_num = 0;
         }
-        extract_free(alloc, &page->images);
-        page->images_num = 0;
     }
     e = 0;
     end:
@@ -367,12 +399,11 @@ On return document->page[].images* will be NULL etc.
 static void extract_document_free(extract_alloc_t* alloc, document_t* document)
 {
     int p;
-    if (!document) {
-        return;
-    }
-    for (p=0; p<document->pages_num; ++p) {
-        extract_page_t* page = document->pages[p];
-        page_free(alloc, &page);
+    if (!document) return;
+
+    for (p=0; p<document->pages_num; ++p)
+    {
+        page_free(alloc, &document->pages[p]);
     }
     extract_free(alloc, &document->pages);
     document->pages = NULL;
@@ -451,11 +482,11 @@ static void s_document_init(document_t* document)
 }
 
 
-static int page_span_end_clean(extract_alloc_t* alloc, extract_page_t* page)
-/* Does preliminary processing of the end of the last span in a page; intended
+static int subpage_span_end_clean(extract_alloc_t* alloc, subpage_t* subpage)
+/* Does preliminary processing of the end of the last span in a subpage; intended
 to be called as we load span information.
 
-Looks at last two char_t's in last span_t of <page>, and either
+Looks at last two char_t's in last span_t of <subpage>, and either
 leaves unchanged, or removes space in last-but-one position, or moves last
 char_t into a new span_t. */
 {
@@ -468,9 +499,9 @@ char_t into a new span_t. */
     double err_x;
     double err_y;
     point_t dir;
-    
-    assert(page->spans_num);
-    span = page->spans[page->spans_num-1];
+
+    assert(subpage->spans_num);
+    span = subpage->spans[subpage->spans_num-1];
     assert(span->chars_num);
 
     /* Last two char_t's are char_[-2] and char_[-1]. */
@@ -547,8 +578,8 @@ char_t into a new span_t. */
                 span_string2(span)
                 );
         {
-            span_t* span2 = page_span_append(alloc, page);
-            if (!span2) goto end;
+            span_t* span2;
+            if (subpage_span_append_new(alloc, subpage, &span2)) goto end;
             *span2 = *span;
             if (extract_strdup(alloc, span->font_name, &span2->font_name)) goto end;
             span2->chars_num = 1;
@@ -567,42 +598,44 @@ char_t into a new span_t. */
 struct extract_t
 {
     extract_alloc_t*    alloc;
-    
+
+    int                 layout_analysis;
+
     document_t          document;
-    
+
     int                 num_spans_split;
-    /* Number of extra spans from page_span_end_clean(). */
-    
+    /* Number of extra spans from subpage_span_end_clean(). */
+
     int                 num_spans_autosplit;
     /* Number of extra spans from autosplit=1. */
-    
+
     double              span_offset_x;
     double              span_offset_y;
     /* Only used if autosplit is non-zero. */
-    
+
     int                 image_n;
     /* Used to generate unique ids for images. */
-    
+
     /* List of strings that are the generated docx content for each page. When
     zip_* can handle appending of data, we will be able to remove this list. */
     extract_astring_t*  contentss;
     int                 contentss_num;
-    
+
     images_t            images;
-    
+
     extract_format_t    format;
     extract_odt_styles_t odt_styles;
-    
+
     char*               tables_csv_format;
     int                 tables_csv_i;
-    
+
     enum
     {
         path_type_NONE,
         path_type_FILL,
         path_type_STROKE,
     } path_type;
-    
+
     union
     {
         struct
@@ -612,7 +645,7 @@ struct extract_t
             point_t     points[4];
             int         n;
         } fill;
-        
+
         struct
         {
             matrix_t    ctm;
@@ -623,7 +656,7 @@ struct extract_t
             point_t     point;
             int         point_set;
         } stroke;
-    
+
     } path;
 };
 
@@ -636,7 +669,7 @@ int extract_begin(
 {
     int e = -1;
     extract_t*  extract;
-    
+
     if (1
             && format != extract_format_ODT
             && format != extract_format_DOCX
@@ -648,29 +681,35 @@ int extract_begin(
         errno = EINVAL;
         return -1;
     }
-    
+
     /* Use a temporary extract_alloc_t to allocate space for the extract_t. */
     if (extract_malloc(alloc, &extract, sizeof(*extract))) goto end;
-    
+
     extract_bzero(extract, sizeof(*extract));
     extract->alloc = alloc;
     s_document_init(&extract->document);
-    
+
     /* Start at 10 because template document might use some low-numbered IDs.
     */
     extract->image_n = 10;
-    
+
     extract->format = format;
     extract->tables_csv_format = NULL;
     extract->tables_csv_i = 0;
-    
+
     e = 0;
-    
+
     end:
     *pextract = (e) ? NULL : extract;
     return e;
 }
 
+int extract_set_layout_analysis(extract_t *extract, int enable)
+{
+    extract->layout_analysis = enable;
+    return 0;
+}
+
 int extract_tables_csv_format(extract_t* extract, const char* path_format)
 {
     return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format);
@@ -686,7 +725,7 @@ static void image_free_fn(void* handle, void* image_data)
 int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int autosplit)
 {
     int ret = -1;
-    
+
     document_t* document = &extract->document;
     char*   image_data = NULL;
     int     num_spans = 0;
@@ -716,16 +755,18 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
     </page>
     ...
 
-    We convert this into a list of extract_page_t's, each containing a list of
+    We convert this into a list of extract_page_t's, each with a subpage_t holding a list of
     span_t's, each containing a list of char_t's.
 
     While doing this, we do some within-span processing by calling
-    page_span_end_clean():
+    subpage_span_end_clean():
         Remove spurious spaces.
         Split spans in two where there seem to be large gaps between glyphs.
     */
     for(;;) {
         extract_page_t* page;
+        subpage_t* subpage;
+        rect_t mediabox = extract_rect_infinite; /* Fake mediabox */
         int e = extract_xml_pparse_next(buffer, &tag);
         if (e == 1) break; /* EOF. */
         if (e) goto end;
@@ -741,14 +782,16 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
             goto end;
         }
         outfx("loading spans for page %i...", document->pages_num);
-        if (extract_page_begin(extract)) goto end;
+        if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end;
         page = extract->document.pages[extract->document.pages_num-1];
         if (!page) goto end;
+        subpage = page->subpages[page->subpages_num-1];
+        if (!subpage) goto end;
 
         for(;;) {
             if (extract_xml_pparse_next(buffer, &tag)) goto end;
             if (!strcmp(tag.name, "/page")) {
-                num_spans += page->spans_num;
+                num_spans += subpage->spans_num;
                 break;
             }
             if (!strcmp(tag.name, "image")) {
@@ -804,20 +847,20 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
                         else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a';
                         else goto compressed_error;
                         byte *= 16;
-                        
+
                         cc = *c;
                         c += 1;
                         if (cc >= '0' && cc <= '9') byte += cc-'0';
                         else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a';
                         else goto compressed_error;
-                        
+
                         image_data[i] = (char) byte;
                         i += 1;
                         if (i == image_data_size) {
                             break;
                         }
                         continue;
-                        
+
                         compressed_error:
                         outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars));
                         errno = EINVAL;
@@ -893,12 +936,12 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
                         trm.e,
                         trm.f
                         )) goto end;
-                
+
                 for(;;) {
-                    double      x;
-                    double      y;
-                    double      adv;
-                    unsigned    ucs;
+                    double         x;
+                    double         y;
+                    double         adv;
+                    unsigned       ucs;
 
                     if (extract_xml_pparse_next(buffer, &tag)) {
                         outf("Failed to find <char or </span");
@@ -917,16 +960,17 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
                     if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end;
                     if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end;
                     if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end;
-                    
-                    if (extract_add_char(extract, x, y, ucs, adv, autosplit)) goto end;
+
+                    /* BBox is bogus here. Analysis will fail. */
+                    if (extract_add_char(extract, x, y, ucs, adv, autosplit, x, y, x + adv, y + adv)) goto end;
                 }
 
                 extract_xml_tag_free(extract->alloc, &tag);
             }
         }
         if (extract_page_end(extract)) goto end;
-        outf("page=%i page->num_spans=%i",
-                document->pages_num, page->spans_num);
+        outf("page=%i subpage->num_spans=%i",
+                document->pages_num, subpage->spans_num);
     }
 
     outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i",
@@ -940,7 +984,7 @@ int extract_read_intermediate(extract_t* extract, extract_buffer_t* buffer, int
     end:
     extract_xml_tag_free(extract->alloc, &tag);
     extract_free(extract->alloc, &image_data);
-    
+
     return ret;
 }
 
@@ -967,9 +1011,11 @@ int extract_span_begin(
 {
     int e = -1;
     extract_page_t* page;
+    subpage_t* subpage;
     span_t* span;
     assert(extract->document.pages_num > 0);
     page = extract->document.pages[extract->document.pages_num-1];
+    subpage = page->subpages[page->subpages_num-1];
     outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i",
             ctm_a,
             ctm_b,
@@ -986,22 +1032,21 @@ int extract_span_begin(
             font_name,
             wmode
             );
-    span = page_span_append(extract->alloc, page);
-    if (!span) goto end;
+    if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end;
     span->ctm.a = ctm_a;
     span->ctm.b = ctm_b;
     span->ctm.c = ctm_c;
     span->ctm.d = ctm_d;
     span->ctm.e = ctm_e;
     span->ctm.f = ctm_f;
-    
+
     span->trm.a = trm_a;
     span->trm.b = trm_b;
     span->trm.c = trm_c;
     span->trm.d = trm_d;
     span->trm.e = trm_e;
     span->trm.f = trm_f;
-    
+
     {
         const char* ff = strchr(font_name, '+');
         const char* f = (ff) ? ff+1 : font_name;
@@ -1019,25 +1064,30 @@ int extract_span_begin(
 
 
 int extract_add_char(
-        extract_t*  extract,
-        double      x,
-        double      y,
-        unsigned    ucs,
-        double      adv,
-        int         autosplit
+        extract_t*     extract,
+        double         x,
+        double         y,
+        unsigned       ucs,
+        double         adv,
+        int            autosplit,
+        double         x0,
+        double         y0,
+        double         x1,
+        double         y1
         )
 {
     int e = -1;
     char_t* char_;
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
-    span_t* span = page->spans[page->spans_num - 1];
-    
+    subpage_t* subpage = page->subpages[page->subpages_num-1];
+    span_t* span = subpage->spans[subpage->spans_num - 1];
+
     outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv);
     /* Ignore the specified <autosplit> - there seems no advantage to not
     splitting spans on multiple lines, and not doing so causes problems with
     missing spaces in the output. */
     autosplit = 1;
-    
+
     if (span->chars_num)
     {
         char_t* char_prev = &span->chars[span->chars_num - 1];
@@ -1065,17 +1115,16 @@ int extract_add_char(
                     dir.x, dir.y, span_a
                     );
             extract->num_spans_autosplit += 1;
-            span = page_span_append(extract->alloc, page);
-            if (!span) goto end;
+            if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end;
             *span = *span0;
             span->chars = NULL;
             span->chars_num = 0;
             if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end;
         }
     }
-    
+
     if (0 && autosplit && y - extract->span_offset_y != 0) {
-        
+
         double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x)
                 + span->ctm.b * (y - extract->span_offset_y);
         double f = span->ctm.f + span->ctm.c * (x - extract->span_offset_x)
@@ -1094,8 +1143,7 @@ int extract_add_char(
             /* Create new span. */
             span_t* span0 = span;
             extract->num_spans_autosplit += 1;
-            span = page_span_append(extract->alloc, page);
-            if (!span) goto end;
+            if (subpage_span_append_new(extract->alloc, subpage, &span)) goto end;
             *span = *span0;
             span->chars = NULL;
             span->chars_num = 0;
@@ -1106,31 +1154,35 @@ int extract_add_char(
         outfx("autosplit: char_pre_y=%f offset_y=%f",
                 char_pre_y, offset_y);
     }
-    
+
     if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end;
     /* Coverity warns, but extract_span_append_c() will have appended an item. */
     /* coverity[var_deref_op] */
     char_ = &span->chars[ span->chars_num-1];
-    
+
     char_->pre_x = x;
     char_->pre_y = y;
 
     char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e;
     char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f;
-    
+
     char_->adv = adv;
     char_->ucs = ucs;
+    char_->bbox.min.x = x0;
+    char_->bbox.min.y = y0;
+    char_->bbox.max.x = x1;
+    char_->bbox.max.y = y1;
 
     {
-        int page_spans_num_old = page->spans_num;
-        if (page_span_end_clean(extract->alloc, page)) goto end;
-        span = page->spans[page->spans_num-1];  /* fixme: unnecessary. */
-        if (page->spans_num != page_spans_num_old) {
+        int subpage_spans_num_old = subpage->spans_num;
+        if (subpage_span_end_clean(extract->alloc, subpage)) goto end;
+        span = subpage->spans[subpage->spans_num-1];  /* fixme: unnecessary. */
+        if (subpage->spans_num != subpage_spans_num_old) {
             extract->num_spans_split += 1;
         }
     }
     e = 0;
-    
+
     end:
     return e;
 }
@@ -1139,13 +1191,14 @@ int extract_add_char(
 int extract_span_end(extract_t* extract)
 {
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
-    span_t* span = page->spans[page->spans_num - 1];
+    subpage_t* subpage = page->subpages[page->subpages_num-1];
+    span_t* span = subpage->spans[subpage->spans_num - 1];
     if (span->chars_num == 0) {
         /* Calling code called extract_span_begin() then extract_span_end()
         without any call to extract_add_char(). Our joining code assumes that
         all spans are non-empty, so we need to delete this span. */
-        extract_free(extract->alloc, &page->spans[page->spans_num - 1]);
-        page->spans_num -= 1;
+        extract_free(extract->alloc, &subpage->spans[subpage->spans_num - 1]);
+        subpage->spans_num -= 1;
     }
     return 0;
 }
@@ -1166,8 +1219,9 @@ int extract_add_image(
 {
     int e = -1;
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    subpage_t* subpage = page->subpages[page->subpages_num-1];
     image_t image_temp = {0};
-    
+
     extract->image_n += 1;
     image_temp.x = x;
     image_temp.y = y;
@@ -1180,29 +1234,29 @@ int extract_add_image(
     if (extract_strdup(extract->alloc, type, &image_temp.type)) goto end;
     if (extract_asprintf(extract->alloc, &image_temp.id, "rId%i", extract->image_n) < 0) goto end;
     if (extract_asprintf(extract->alloc, &image_temp.name, "image%i.%s", extract->image_n, image_temp.type) < 0) goto end;
-    
+
     if (extract_realloc2(
             extract->alloc,
-            &page->images,
-            sizeof(image_t) * page->images_num,
-            sizeof(image_t) * (page->images_num + 1)
+            &subpage->images,
+            sizeof(image_t) * subpage->images_num,
+            sizeof(image_t) * (subpage->images_num + 1)
             )) goto end;
-    
-    page->images[page->images_num] = image_temp;
-    page->images_num += 1;
-    outf("page->images_num=%i", page->images_num);
-    
+
+    subpage->images[subpage->images_num] = image_temp;
+    subpage->images_num += 1;
+    outf("subpage->images_num=%i", subpage->images_num);
+
     e = 0;
-    
+
     end:
-    
+
     if (e) {
         extract_free(extract->alloc, &image_temp.type);
         extract_free(extract->alloc, &image_temp.data);
         extract_free(extract->alloc, &image_temp.id);
         extract_free(extract->alloc, &image_temp.name);
     }
-    
+
     return e;
 }
 
@@ -1220,7 +1274,7 @@ static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, r
     return 0;
 }
 
-static point_t transform(double x, double y, 
+static point_t transform(double x, double y,
         double ctm_a,
         double ctm_b,
         double ctm_c,
@@ -1265,6 +1319,7 @@ int extract_add_path4(
         )
 {
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    subpage_t* subpage = page->subpages[page->subpages_num-1];
     point_t points[4] = {
             transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
             transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
@@ -1303,20 +1358,20 @@ int extract_add_path4(
     if (points[(i+4) % 4].y != y0)  return 0;
     rect.min.y = (y1 > y0) ? y0 : y1;
     rect.max.y = (y1 > y0) ? y1 : y0;
-    
+
     dx = rect.max.x - rect.min.x;
     dy = rect.max.y - rect.min.y;
     if (dx / dy > 5)
     {
         /* Horizontal line. */
         outf("have found horizontal line: %s", extract_rect_string(&rect));
-        if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1;
+        if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1;
     }
     else if (dy / dx > 5)
     {
         /* Vertical line. */
         outf("have found vertical line: %s", extract_rect_string(&rect));
-        if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1;
+        if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1;
     }
     return 0;
 }
@@ -1339,6 +1394,7 @@ int extract_add_line(
         )
 {
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    subpage_t* subpage = page->subpages[page->subpages_num-1];
     point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
     point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
     double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c));
@@ -1348,7 +1404,7 @@ int extract_add_line(
     rect.min.y = s_min(p0.y, p1.y);
     rect.max.x = s_max(p0.x, p1.x);
     rect.max.y = s_max(p0.y, p1.y);
-    
+
     outf("%s: width=%f ((%f %f)(%f %f)) rect=%s",
             extract_FUNCTION,
             width,
@@ -1362,49 +1418,98 @@ int extract_add_line(
     {
         rect.min.x -= width2 / 2;
         rect.max.x += width2 / 2;
-        return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color);
+        return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color);
     }
     else if (rect.min.y == rect.max.y)
     {
         rect.min.y -= width2 / 2;
         rect.max.y += width2 / 2;
-        return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color);
+        return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color);
     }
     return 0;
 }
 
+int extract_subpage_alloc(extract_alloc_t* alloc, rect_t mediabox, extract_page_t* page, subpage_t** psubpage)
+/* Creates an empty subpage_t with the given mediabox, hands it back via
+*psubpage, and appends it to page->subpages. Returns 0, or -1 on failure. */
+{
+    subpage_t* fresh;
+
+    if (extract_malloc(alloc, psubpage, sizeof(subpage_t))) return -1;
+    fresh = *psubpage;
+    fresh->mediabox = mediabox;
+    fresh->spans_num = 0;
+    fresh->spans = NULL;
+    fresh->lines_num = 0;
+    fresh->lines = NULL;
+    fresh->paragraphs_num = 0;
+    fresh->paragraphs = NULL;
+    fresh->images_num = 0;
+    fresh->images = NULL;
+    fresh->tablelines_horizontal.tablelines_num = 0;
+    fresh->tablelines_horizontal.tablelines = NULL;
+    fresh->tablelines_vertical.tablelines_num = 0;
+    fresh->tablelines_vertical.tablelines = NULL;
+    fresh->tables_num = 0;
+    fresh->tables = NULL;
+
+    /* Grow page->subpages by one slot; release the new subpage if that fails. */
+    if (extract_realloc2(alloc, &page->subpages,
+            sizeof(subpage_t*) * page->subpages_num,
+            sizeof(subpage_t*) * (page->subpages_num + 1)))
+    {
+        extract_free(alloc, psubpage);
+        return -1;
+    }
+
+    page->subpages[page->subpages_num] = fresh;
+    page->subpages_num++;
+    return 0;
+}
+
+static int extract_subpage_begin(extract_t* extract, double x0, double y0, double x1, double y1)
+/* Adds a new empty subpage_t, with mediabox (x0,y0)-(x1,y1), to the final page of extract->document. */
+{
+    rect_t mediabox = { { x0, y0 }, { x1, y1 } };
+    subpage_t* created;
+    int last = extract->document.pages_num - 1;
+
+    return extract_subpage_alloc(extract->alloc, mediabox, extract->document.pages[last], &created);
+}
 
-int extract_page_begin(extract_t* extract)
+int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1)
 {
-    /* Appends new empty extract_page_t to an extract->document. */
+    /* Appends a new empty extract_page_t with mediabox (x0,y0)-(x1,y1) to extract->document, then gives it one initial subpage with the same box. Returns 0 on success, -1 on failure. */
     extract_page_t* page;
-    if (extract_malloc(extract->alloc, &page, sizeof(extract_page_t))) return -1;
-    page->spans = NULL;
-    page->spans_num = 0;
-    page->lines = NULL;
-    page->lines_num = 0;
-    page->paragraphs = NULL;
-    page->paragraphs_num = 0;
-    page->images = NULL;
-    page->images_num = 0;
-    page->tablelines_horizontal.tablelines = NULL;
-    page->tablelines_horizontal.tablelines_num = 0;
-    page->tablelines_vertical.tablelines = NULL;
-    page->tablelines_vertical.tablelines_num = 0;
-    page->tables = NULL;
-    page->tables_num = 0;
-    
+
+    if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1;
+    page->mediabox.min.x = x0;
+    page->mediabox.min.y = y0;
+    page->mediabox.max.x = x1;
+    page->mediabox.max.y = y1;
+    page->subpages = NULL;
+    page->subpages_num = 0;
+    page->split = NULL;
+
     if (extract_realloc2(
             extract->alloc,
             &extract->document.pages,
-            sizeof(extract_page_t*) * extract->document.pages_num + 1,
-            sizeof(extract_page_t*) * (extract->document.pages_num + 1)
+            sizeof(extract_page_t*) * extract->document.pages_num,
+            sizeof(extract_page_t*) * (extract->document.pages_num + 1)
             )) {
         extract_free(extract->alloc, &page);
         return -1;
     }
+
     extract->document.pages[extract->document.pages_num] = page;
     extract->document.pages_num += 1;
+
+    if (extract_subpage_begin(extract, x0, y0, x1, y1)) {
+        extract->document.pages_num--;  /* Roll back the append so the failed page is not left in the document. */
+        page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -1634,13 +1739,22 @@ int extract_stroke_end(extract_t* extract)
 
 
 
-int extract_page_end(extract_t* extract)
+static int extract_subpage_end(extract_t* extract) /* Currently a no-op: ignores <extract> and always returns 0. */
 {
     (void) extract;
     return 0;
 }
 
 
+int extract_page_end(extract_t* extract)
+/* Calls extract_subpage_end() and maps any non-zero result to -1; otherwise returns 0. */
+{
+    int failed = extract_subpage_end(extract);
+
+    return failed ? -1 : 0;
+}
+
+
 static int paragraphs_to_text_content(
         extract_alloc_t* alloc,
         paragraph_t** paragraphs,
@@ -1692,54 +1806,59 @@ static int extract_write_tables_csv(extract_t* extract)
     FILE* f = NULL;
     extract_astring_t text = {NULL, 0};
     if (!extract->tables_csv_format) return 0;
-    
+
     outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format);
     outf("extract->document.pages_num=%i", extract->document.pages_num);
     for (p=0; p<extract->document.pages_num; ++p)
     {
+        int c;
         extract_page_t* page = extract->document.pages[p];
-        int t;
-        outf("p=%i page->tables_num=%i", p, page->tables_num);
-        for (t=0; t<page->tables_num; ++t)
+        for (c=0; c<page->subpages_num; ++c)
         {
-            table_t* table = page->tables[t];
-            int y;
-            extract_free(extract->alloc, &path);
-            if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end;
-            extract->tables_csv_i += 1;
-            outf("Writing table %i to: %s", t, path);
-            outf("table->cells_num_x=%i", table->cells_num_x);
-            outf("table->cells_num_y=%i", table->cells_num_y);
-            f = fopen(path, "w");
-            if (!f) goto end;
-            for (y=0; y<table->cells_num_y; ++y)
+            subpage_t* subpage = page->subpages[c];
+            int t;
+            outf("p=%i subpage->tables_num=%i", p, subpage->tables_num);
+            for (t=0; t<subpage->tables_num; ++t)
             {
-                int x;
-                int have_output = 0;
-                for (x=0; x<table->cells_num_x; ++x)
+                table_t* table = subpage->tables[t];
+                int y;
+                extract_free(extract->alloc, &path);
+                if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end;
+                extract->tables_csv_i += 1;
+                outf("Writing table %i to: %s", t, path);
+                outf("table->cells_num_x=%i", table->cells_num_x);
+                outf("table->cells_num_y=%i", table->cells_num_y);
+                f = fopen(path, "w");
+                if (!f) goto end;
+                for (y=0; y<table->cells_num_y; ++y)
                 {
-                    cell_t* cell = table->cells[table->cells_num_x * y + x];
-                    extract_astring_free(extract->alloc, &text);
-                    if (y==0)
+                    int x;
+                    int have_output = 0;
+                    for (x=0; x<table->cells_num_x; ++x)
                     {
-                        outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect));
+                        cell_t* cell = table->cells[table->cells_num_x * y + x];
+                        extract_astring_free(extract->alloc, &text);
+                        if (y==0)
+                        {
+                            outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect));
+                        }
+                        if (have_output) fprintf(f, ",");
+                        have_output = 1;
+                        if (paragraphs_to_text_content(
+                                extract->alloc,
+                                cell->paragraphs,
+                                cell->paragraphs_num,
+                                &text
+                                )) goto end;
+                        /* Reference csv output trims trailing spaces. */
+                        extract_astring_char_truncate_if(&text, ' ');
+                        fprintf(f, "\"%s\"", text.chars ? text.chars : "");
                     }
-                    if (have_output) fprintf(f, ",");
-                    have_output = 1;
-                    if (paragraphs_to_text_content(
-                            extract->alloc,
-                            cell->paragraphs,
-                            cell->paragraphs_num,
-                            &text
-                            )) goto end;
-                    /* Reference cvs output trims trailing spaces. */
-                    extract_astring_char_truncate_if(&text, ' ');
-                    fprintf(f, "\"%s\"", text.chars ? text.chars : "");
+                    fprintf(f, "\n");
                 }
-                fprintf(f, "\n");
+                fclose(f);
+                f = NULL;
             }
-            fclose(f);
-            f = NULL;
         }
     }
     ret = 0;
@@ -1760,7 +1879,7 @@ int extract_process(
         )
 {
     int e = -1;
-    
+
     if (extract_realloc2(
             extract->alloc,
             &extract->contentss,
@@ -1769,9 +1888,9 @@ int extract_process(
             )) goto end;
     extract_astring_init(&extract->contentss[extract->contentss_num]);
     extract->contentss_num += 1;
-    
-    if (extract_document_join(extract->alloc, &extract->document)) goto end;
-    
+
+    if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis)) goto end;
+
     if (extract->format == extract_format_ODT)
     {
         if (extract_document_to_odt_content(
@@ -1811,12 +1930,17 @@ int extract_process(
         for (p=0; p<extract->document.pages_num; ++p)
         {
             extract_page_t* page = extract->document.pages[p];
-            if (paragraphs_to_text_content(
-                    extract->alloc,
-                    page->paragraphs,
-                    page->paragraphs_num,
-                    &extract->contentss[extract->contentss_num - 1]
+            int c;
+            for (c=0; c<page->subpages_num; ++c)
+            {
+                subpage_t* subpage = page->subpages[c];
+                if (paragraphs_to_text_content(
+                        extract->alloc,
+                        subpage->paragraphs,
+                        subpage->paragraphs_num,
+                        &extract->contentss[extract->contentss_num - 1]
                     )) goto end;
+            }
         }
     }
     else
@@ -1828,23 +1952,23 @@ int extract_process(
     }
 
     if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end;
-    
+
     if (extract->tables_csv_format)
     {
         extract_write_tables_csv(extract);
     }
-    
+
     {
-        int i;
-        for (i=0; i<extract->document.pages_num; ++i) {
-            page_free(extract->alloc, &extract->document.pages[i]);
+        int p;
+        for (p=0; p<extract->document.pages_num; ++p) {
+            page_free(extract->alloc, &extract->document.pages[p]);
         }
         extract_free(extract->alloc, &extract->document.pages);
         extract->document.pages_num = 0;
     }
-    
+
     e = 0;
-    
+
     end:
     return e;
 }
@@ -1855,7 +1979,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
     extract_zip_t*  zip = NULL;
     char*           text2 = NULL;
     int             i;
-    
+
     if (extract->format == extract_format_ODT)
     {
         if (extract_zip_open(buffer, &zip)) goto end;
@@ -1922,7 +2046,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
             if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
         }
         if (extract_zip_close(&zip)) goto end;
-        
+
     }
     else if (extract->format == extract_format_HTML)
     {
@@ -1945,9 +2069,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
         errno = EINVAL;
         return 1;
     }
-    
+
     e = 0;
-    
+
     end:
     if (e)
     {
@@ -1955,7 +2079,7 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
         extract_zip_close(&zip);
     }
     extract_free(extract->alloc, &text2);
-    
+
     return e;
 }
 
@@ -1982,7 +2106,7 @@ static int string_ends_with(const char* string, const char* end)
 }
 
 int extract_write_template(
-        extract_t*  extract, 
+        extract_t*  extract,
         const char* path_template,
         const char* path_out,
         int         preserve_dir
@@ -2021,7 +2145,7 @@ void extract_end(extract_t** pextract)
     extract_t* extract = *pextract;
     if (!extract) return;
     extract_document_free(extract->alloc, &extract->document);
-    
+
     {
         int i;
         for (i=0; i<extract->contentss_num; ++i) {