26 files changed, 3533 insertions, 853 deletions
diff --git a/extract/src/astring.c b/extract/src/astring.c
index fd09d639..e5d40217 100644
--- a/extract/src/astring.c
+++ b/extract/src/astring.c
@@ -27,6 +27,9 @@ void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string)
 int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len)
 {
     if (extract_realloc2(alloc, &string->chars, string->chars_num+1, string->chars_num + s_len + 1)) return -1;
+    /* Coverity doesn't seem to realise that extract_realloc2() modifies
+    string->chars. */
+    /* coverity[deref_parm_field_in_call] */
     memcpy(string->chars + string->chars_num, s, s_len);
     string->chars[string->chars_num + s_len] = 0;
     string->chars_num += s_len;
@@ -65,7 +68,7 @@ int extract_astring_truncate(extract_astring_t* content, int len)
     return 0;
 }
 
-int astring_char_truncate_if(extract_astring_t* content, char c)
+int extract_astring_char_truncate_if(extract_astring_t* content, char c)
 {
     if (content->chars_num && content->chars[content->chars_num-1] == c) {
         extract_astring_truncate(content, 1);
@@ -73,40 +76,58 @@ int astring_char_truncate_if(extract_astring_t* content, char c)
     return 0;
 }
 
-int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c)
+int extract_astring_catc_unicode(
+        extract_alloc_t* alloc,
+        extract_astring_t* string,
+        int c,
+        int xml,
+        int ascii_ligatures,
+        int ascii_dash,
+        int ascii_apostrophe
+        )
 {
     int ret = -1;
     
     if (0) {}
 
     /* Escape XML special characters. */
-    else if (c == '<')  extract_astring_cat(alloc, string, "&lt;");
-    else if (c == '>')  extract_astring_cat(alloc, string, "&gt;");
-    else if (c == '&')  extract_astring_cat(alloc, string, "&amp;");
-    else if (c == '"')  extract_astring_cat(alloc, string, "&quot;");
-    else if (c == '\'') extract_astring_cat(alloc, string, "&apos;");
+    else if (xml && c == '<')  extract_astring_cat(alloc, string, "&lt;");
+    else if (xml && c == '>')  extract_astring_cat(alloc, string, "&gt;");
+    else if (xml && c == '&')  extract_astring_cat(alloc, string, "&amp;");
+    else if (xml && c == '"')  extract_astring_cat(alloc, string, "&quot;");
+    else if (xml && c == '\'') extract_astring_cat(alloc, string, "&apos;");
 
     /* Expand ligatures. */
-    else if (c == 0xFB00)
+    else if (ascii_ligatures && c == 0xFB00)
     {
         if (extract_astring_cat(alloc, string, "ff")) goto end;
     }
-    else if (c == 0xFB01)
+    else if (ascii_ligatures && c == 0xFB01)
     {
         if (extract_astring_cat(alloc, string, "fi")) goto end;
     }
-    else if (c == 0xFB02)
+    else if (ascii_ligatures && c == 0xFB02)
     {
         if (extract_astring_cat(alloc, string, "fl")) goto end;
     }
-    else if (c == 0xFB03)
+    else if (ascii_ligatures && c == 0xFB03)
     {
         if (extract_astring_cat(alloc, string, "ffi")) goto end;
     }
-    else if (c == 0xFB04)
+    else if (ascii_ligatures && c == 0xFB04)
     {
         if (extract_astring_cat(alloc, string, "ffl")) goto end;
     }
+    
+    /* Convert some special characters to ascii. */
+    else if (ascii_dash && c == 0x2212)
+    {
+        if (extract_astring_catc(alloc, string, '-')) goto end;
+    }
+    else if (ascii_apostrophe && c == 0x2019)
+    {
+        if (extract_astring_catc(alloc, string, '\'')) goto end;
+    }
 
     /* Output ASCII verbatim. */
     else if (c >= 32 && c <= 127)
@@ -117,18 +138,65 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string,
     /* Escape all other characters. */
     else
     {
-        char    buffer[32];
-        if (c < 32
-                && (c != 0x9 && c != 0xa && c != 0xd)
-                )
+        if (xml)
         {
-            /* Illegal xml character; see
-            https://www.w3.org/TR/xml/#charsets. We replace with
-            0xfffd, the unicode replacement character. */
-            c = 0xfffd;
+            char    buffer[32];
+            if (c < 32
+                    && (c != 0x9 && c != 0xa && c != 0xd)
+                    )
+            {
+                /* Illegal xml character; see
+                https://www.w3.org/TR/xml/#charsets. We replace with
+                0xfffd, the unicode replacement character. */
+                c = 0xfffd;
+            }
+            snprintf(buffer, sizeof(buffer), "&#x%x;", c);
+            if (extract_astring_cat(alloc, string, buffer)) goto end;
+        }
+        else
+        {
+            /* Use utf8. */
+            if (c < 0x80)
+            {
+                if (extract_astring_catc(alloc, string, (char) c)) return -1;
+            }
+            else if (c < 0x0800)
+            {
+                char cc[2] = 
+                {
+                    (char) (((c >> 6) & 0x1f) | 0xc0),
+                    (char) (((c >> 0) & 0x3f) | 0x80)
+                };
+                if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+            }
+            else if (c < 0x10000)
+            {
+                char cc[3] = 
+                {
+                    (char) (((c >> 12) & 0x0f) | 0xe0),
+                    (char) (((c >>  6) & 0x3f) | 0x80),
+                    (char) (((c >>  0) & 0x3f) | 0x80)
+                };
+                if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+            }
+            else if (c < 0x110000)
+            {
+                char cc[4] = 
+                {
+                    (char) (((c >> 18) & 0x07) | 0xf0),
+                    (char) (((c >> 12) & 0x3f) | 0x80),
+                    (char) (((c >>  6) & 0x3f) | 0x80),
+                    (char) (((c >>  0) & 0x3f) | 0x80)
+                };
+                if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+            }
+            else
+            {
+                /* Use replacement character. */
+                char cc[4] = { (char) 0xef, (char) 0xbf, (char) 0xbd, 0};
+                if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
+            }
         }
-        snprintf(buffer, sizeof(buffer), "&#x%x;", c);
-        if (extract_astring_cat(alloc, string, buffer)) goto end;
     }
     
     ret = 0;
@@ -136,3 +204,18 @@ int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string,
     end:
     return ret;
 }
+
+int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c)
+{
+    /* Fixme, better to use ascii_ligatures=0, but that requires updates to
+    expected output files. */
+    return extract_astring_catc_unicode(
+            alloc,
+            string,
+            c,
+            1 /*xml*/,
+            1 /*ascii_ligatures*/,
+            0 /*ascii_dash*/,
+            0 /*ascii_apostrophe*/
+            );
+}
diff --git a/extract/src/astring.h b/extract/src/astring.h
index c2b60d25..aef4d87f 100644
--- a/extract/src/astring.h
+++ b/extract/src/astring.h
@@ -11,8 +11,11 @@ typedef struct
 } extract_astring_t;
 
 void extract_astring_init(extract_astring_t* string);
+/* Initialises <string> so it is ready for use. */
 
 void extract_astring_free(extract_alloc_t* alloc, extract_astring_t* string);
+/* Frees any existing data and returns with <string> ready for use as if by
+extract_astring_init(). */
 
 int extract_astring_catl(extract_alloc_t* alloc, extract_astring_t* string, const char* s, size_t s_len);
 
@@ -24,10 +27,33 @@ int extract_astring_catf(extract_alloc_t* alloc, extract_astring_t* string, cons
 int extract_astring_truncate(extract_astring_t* content, int len);
 /* Removes last <len> chars. */
 
-int astring_char_truncate_if(extract_astring_t* content, char c);
+int extract_astring_char_truncate_if(extract_astring_t* content, char c);
 /* Removes last char if it is <c>. */
 
 int extract_astring_cat_xmlc(extract_alloc_t* alloc, extract_astring_t* string, int c);
 /* Appends specified character using XML escapes as necessary. */
 
+int extract_astring_catc_unicode(
+        extract_alloc_t* alloc,
+        extract_astring_t* string,
+        int c,
+        int xml,
+        int ascii_ligatures,
+        int ascii_dash,
+        int ascii_apostrophe
+        );
+/* Appends unicode character <c> to <string>.
+    xml:
+        If true, we use XML escape sequences for special characters such as '<'
+        and unicode values above 127. Otherwise we encode as utf8.
+    ascii_ligatures: if true we expand ligatures to "fl", "fi" etc.
+    ascii_dash:
+        If true we replace the unicode minus sign (U+2212) with '-'.
+    ascii_apostrophe:
+        If true we replace unicode apostrophe (U+2019) with ascii single-quote "'".
+*/
+
+int extract_astring_catc_unicode_xml(extract_alloc_t* alloc, extract_astring_t* string, int c);
+/* Appends specified unicode character, using XML escape sequences as required. */
+
 #endif
diff --git a/extract/src/buffer-test.c b/extract/src/buffer-test.c
index 6701fbab..a8464c2a 100644
--- a/extract/src/buffer-test.c
+++ b/extract/src/buffer-test.c
@@ -298,7 +298,7 @@ static void test_file(void)
 
 int main(void)
 {
-    outf_verbose_set(1);
+    extract_outf_verbose_set(1);
     test_read();
     test_write();
     test_file();
diff --git a/extract/src/buffer.c b/extract/src/buffer.c
index 3fd35bfd..b25dee73 100644
--- a/extract/src/buffer.c
+++ b/extract/src/buffer.c
@@ -375,7 +375,7 @@ int extract_buffer_write_internal(
                     not recoverable. <pos> will be the number of bytes in
                     source..+numbytes that have been successfully flushed, and
                     could be negative if we failed to flush earlier data. */
-                    outf("failed to flush. actual=%i delta=%i\n", actual, delta);
+                    outf("failed to flush. actual=%li delta=%li\n", (long) actual, (long) delta);
                     e = 0;
                     goto end;
                 }
diff --git a/extract/src/document.c b/extract/src/document.c
new file mode 100644
index 00000000..d501f259
--- /dev/null
+++ b/extract/src/document.c
@@ -0,0 +1,88 @@
+#include "document.h"
+#include "outf.h"
+
+
+void extract_span_init(span_t* span)
+{
+    span->font_name = NULL;
+    span->chars = NULL;
+    span->chars_num = 0;
+}
+
+void extract_span_free(extract_alloc_t* alloc, span_t** pspan)
+{
+    if (!*pspan) return;
+    extract_free(alloc, &(*pspan)->font_name);
+    extract_free(alloc, &(*pspan)->chars);
+    extract_free(alloc, pspan);
+}
+
+void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num)
+{
+    span_t** spans = *pspans;
+    int s;
+    for (s=0; s<spans_num; ++s)
+    {
+        extract_span_free(alloc, &spans[s]);
+    }
+    extract_free(alloc, pspans);
+}
+
+void extract_line_free(extract_alloc_t* alloc, line_t** pline)
+{
+    line_t* line = *pline;
+    int s;
+    for (s=0; s<line->spans_num; ++s)
+    {
+        extract_span_free(alloc, &line->spans[s]);
+    }
+    extract_free(alloc, &line->spans);
+    extract_free(alloc, pline);
+}
+
+void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num)
+{
+    int l;
+    line_t** lines = *plines;
+    for (l=0; l<lines_num; ++l)
+    {
+        extract_line_free(alloc, &lines[l]);
+    }
+    extract_free(alloc, plines);
+}
+
+void extract_image_clear(extract_alloc_t* alloc, image_t* image)
+{
+    extract_free(alloc, &image->type);
+    extract_free(alloc, &image->name);
+    extract_free(alloc, &image->id);
+    if (image->data_free) {
+        image->data_free(image->data_free_handle, image->data);
+    }
+}
+
+void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell)
+{
+    int p;
+    cell_t* cell = *pcell;
+    if (!cell) return;
+    
+    outf("cell->lines_num=%i", cell->lines_num);
+    outf("cell->paragraphs_num=%i", cell->paragraphs_num);
+    extract_lines_free(alloc, &cell->lines, cell->lines_num);
+    
+    outf("cell=%p cell->paragraphs_num=%i", cell, cell->paragraphs_num);
+    for (p=0; p<cell->paragraphs_num; ++p)
+    {
+        paragraph_t* paragraph = cell->paragraphs[p];
+        outf("paragraph->lines_num=%i", paragraph->lines_num);
+        /* We don't attempt to free paragraph->lines[] because they point into
+        cell->lines which are already freed. */
+        extract_free(alloc, &paragraph->lines);
+        extract_free(alloc, &cell->paragraphs[p]);
+    }
+    extract_free(alloc, &cell->paragraphs);
+    extract_free(alloc, pcell);
+}
+
+
diff --git a/extract/src/document.h b/extract/src/document.h
index c59348f4..2dc4f1ee 100644
--- a/extract/src/document.h
+++ b/extract/src/document.h
@@ -1,6 +1,15 @@
 #ifndef ARTIFEX_EXTRACT_DOCUMENT_H
 #define ARTIFEX_EXTRACT_DOCUMENT_H
 
+#include "../include/extract.h"
+
+#ifdef _MSC_VER
+    #include "compat_stdint.h"
+#else
+    #include <stdint.h>
+#endif
+
+
 static const double pi = 3.141592653589793;
 
 typedef struct
@@ -9,6 +18,16 @@ typedef struct
     double y;
 } point_t;
 
+const char* extract_point_string(const point_t* point);
+
+typedef struct
+{
+    point_t min;
+    point_t max;
+} rect_t;
+
+const char* extract_rect_string(const rect_t* rect);
+
 typedef struct
 {
     double  a;
@@ -19,9 +38,15 @@ typedef struct
     double  f;
 } matrix_t;
 
-double matrix_expansion(matrix_t m);
+const char* extract_matrix_string(const matrix_t* matrix);
 
-int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
+double      extract_matrix_expansion(matrix_t m);
+/* Returns a*d - b*c. */
+
+point_t     extract_multiply_matrix_point(matrix_t m, point_t p);
+matrix_t    extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2);
+
+int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
 ;
 /* Returns zero if first four members of *lhs and *rhs are equal, otherwise
 +/-1. */
@@ -48,7 +73,7 @@ typedef struct
     matrix_t    trm;
     char*       font_name;
     
-    /* font size is matrix_expansion(trm). */
+    /* font size is extract_matrix_expansion(trm). */
     
     struct {
         unsigned font_bold      : 1;
@@ -61,14 +86,21 @@ typedef struct
 } span_t;
 /* List of chars that have same font and are usually adjacent. */
 
-char_t* span_char_last(span_t* span);
+void extract_span_init(span_t* span);
+
+void extract_span_free(extract_alloc_t* alloc, span_t** pspan);
+/* Frees a span_t, returning with *pspan set to NULL. */
+
+void extract_spans_free(extract_alloc_t* alloc, span_t*** pspans, int spans_num);
+
+char_t* extract_span_char_last(span_t* span);
 /* Returns last character in span. */
 
-int span_append_c(extract_alloc_t* alloc, span_t* span, int c);
+int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c);
 /* Appends new char_t to an span_t with .ucs=c and all other
 fields zeroed. */
 
-const char* span_string(extract_alloc_t* alloc, span_t* span);
+const char* extract_span_string(extract_alloc_t* alloc, span_t* span);
 /* Returns static string containing info about span_t. */
 
 typedef struct
@@ -78,10 +110,13 @@ typedef struct
 } line_t;
 /* List of spans that are aligned on same line. */
 
-span_t* line_span_first(line_t* line);
+void extract_line_free(extract_alloc_t* alloc, line_t** pline);
+void extract_lines_free(extract_alloc_t* alloc, line_t*** plines, int lines_num);
+
+span_t* extract_line_span_first(line_t* line);
 /* Returns first span in a line. */
 
-span_t* line_span_last(line_t* line);
+span_t* extract_line_span_last(line_t* line);
 /* Returns last span in a line. */
 
 typedef struct
@@ -112,6 +147,61 @@ typedef struct
 <name> and <id> are created to be unique identifiers for use in generated docx
 file. */
 
+void extract_image_clear(extract_alloc_t* alloc, image_t* image);
+
+typedef struct
+{
+    float   color;
+    rect_t  rect;
+} tableline_t;
+/* A line that is part of a table. */
+
+typedef struct
+{
+    tableline_t*    tablelines;
+    int             tablelines_num;
+} tablelines_t;
+
+
+typedef struct
+{
+    rect_t          rect;
+    
+    /* If left/above is true, this cell is not obscured by cell to its
+    left/above. */
+    uint8_t         left;
+    uint8_t         above;
+    
+    /* extend_right and extend_down are 1 for normal cells, 2 for cells which
+    extend right/down to cover an additional column/row, 3 to cover two
+    additional columns/rows etc. */
+    int             extend_right;
+    int             extend_down;
+    
+    /* Contents of this cell. */
+    line_t**        lines;
+    int             lines_num;
+    paragraph_t**   paragraphs;
+    int             paragraphs_num;
+} cell_t;
+/* A cell within a table. */
+
+void extract_cell_init(cell_t* cell);
+void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell);
+
+typedef struct
+{
+    point_t     pos;    /* top-left. */
+    
+    /* Array of cells_num_x*cells_num_y cells; cell (x, y) is:
+        cells_num_x * y + x.
+    */
+    cell_t**    cells;
+    int         cells_num_x;
+    int         cells_num_y;
+} table_t;
+
+
 typedef struct
 {
     span_t**    spans;
@@ -129,10 +219,17 @@ typedef struct
     int             paragraphs_num;
     /* These refer to items in .lines. Initially empty, then set
     by extract_join(). */
+    
+    tablelines_t    tablelines_horizontal;
+    tablelines_t    tablelines_vertical;
+    
+    table_t**   tables;
+    int         tables_num;
 
 } extract_page_t;
 /* A page. Contains different representations of the list of spans. NB not
-called page_t because this clashes with a system type on hpux. */
+called page_t because this clashes with a system type on hpux. */
+
 
 typedef struct
 {
@@ -150,9 +247,31 @@ typedef struct
     int         imagetypes_num;
 } images_t;
 
+
 int extract_document_join(extract_alloc_t* alloc, document_t* document);
+/* This does all the work of finding paragraphs and tables. */
 
 double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm);
 
+/* Things below here are used when generating output. */
+
+typedef struct
+{
+    char*   name;
+    double  size;
+    int     bold;
+    int     italic;
+} font_t;
+/* Basic information about current font. */
+
+typedef struct
+{
+    font_t      font;
+    matrix_t*   ctm_prev;
+} content_state_t;
+/* Used to keep track of font information when writing paragraphs of docx or
+odt content, e.g. so we know whether a font has changed so need to start a new
+span. */
+
 
 #endif
diff --git a/extract/src/docx.c b/extract/src/docx.c
index 4532cd4e..761de176 100644
--- a/extract/src/docx.c
+++ b/extract/src/docx.c
@@ -21,6 +21,7 @@ docx_paragraph_finish(). */
 
 #include <assert.h>
 #include <errno.h>
+#include <float.h>
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,46 +30,42 @@ docx_paragraph_finish(). */
 #include <sys/stat.h>
 
 
-static int extract_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
 {
     return extract_astring_cat(alloc, content, "\n\n<w:p>");
 }
 
-static int extract_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
 {
     return extract_astring_cat(alloc, content, "\n</w:p>");
 }
 
-static int extract_docx_run_start(
+static int s_docx_run_start(
         extract_alloc_t* alloc,
         extract_astring_t* content,
-        const char* font_name,
-        double font_size,
-        int bold,
-        int italic
+        content_state_t* content_state
         )
-/* Starts a new run. Caller must ensure that extract_docx_run_finish() was
+/* Starts a new run. Caller must ensure that s_docx_run_finish() was
 called to terminate any previous run. */
 {
     int e = 0;
     if (!e) e = extract_astring_cat(alloc, content, "\n<w:r><w:rPr><w:rFonts w:ascii=\"");
-    if (!e) e = extract_astring_cat(alloc, content, font_name);
+    if (!e) e = extract_astring_cat(alloc, content, content_state->font.name);
     if (!e) e = extract_astring_cat(alloc, content, "\" w:hAnsi=\"");
-    if (!e) e = extract_astring_cat(alloc, content, font_name);
+    if (!e) e = extract_astring_cat(alloc, content, content_state->font.name);
     if (!e) e = extract_astring_cat(alloc, content, "\"/>");
-    if (!e && bold) e = extract_astring_cat(alloc, content, "<w:b/>");
-    if (!e && italic) e = extract_astring_cat(alloc, content, "<w:i/>");
+    if (!e && content_state->font.bold) e = extract_astring_cat(alloc, content, "<w:b/>");
+    if (!e && content_state->font.italic) e = extract_astring_cat(alloc, content, "<w:i/>");
     {
         char   font_size_text[32];
-        if (0) font_size = 10;
 
         if (!e) e = extract_astring_cat(alloc, content, "<w:sz w:val=\"");
-        snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 2);
+        snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2);
         extract_astring_cat(alloc, content, font_size_text);
         extract_astring_cat(alloc, content, "\"/>");
 
         if (!e) e = extract_astring_cat(alloc, content, "<w:szCs w:val=\"");
-        snprintf(font_size_text, sizeof(font_size_text), "%f", font_size * 1.5);
+        snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 1.5);
         extract_astring_cat(alloc, content, font_size_text);
         extract_astring_cat(alloc, content, "\"/>");
     }
@@ -77,38 +74,39 @@ called to terminate any previous run. */
 
 }
 
-static int extract_docx_run_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_run_finish(extract_alloc_t* alloc, content_state_t* state, extract_astring_t* content)
 {
+    if (state) state->font.name = NULL;
     return extract_astring_cat(alloc, content, "</w:t></w:r>");
 }
 
-static int extract_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_docx_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content)
 /* Append an empty paragraph to *content. */
 {
     int e = -1;
-    if (extract_docx_paragraph_start(alloc, content)) goto end;
+    static char fontname[] = "OpenSans";
+    content_state_t content_state = {0};
+    if (s_docx_paragraph_start(alloc, content)) goto end;
     /* It seems like our choice of font size here doesn't make any difference
     to the ammount of vertical space, unless we include a non-space
     character. Presumably something to do with the styles in the template
     document. */
-    if (extract_docx_run_start(
-            alloc,
-            content,
-            "OpenSans",
-            10 /*font_size*/,
-            0 /*font_bold*/,
-            0 /*font_italic*/
-            )) goto end;
+    content_state.font.name = fontname;
+    content_state.font.size = 10;
+    content_state.font.bold = 0;
+    content_state.font.italic = 0;
+    
+    if (s_docx_run_start(alloc, content, &content_state)) goto end;
     //docx_char_append_string(content, "&#160;");   /* &#160; is non-break space. */
-    if (extract_docx_run_finish(alloc, content)) goto end;
-    if (extract_docx_paragraph_finish(alloc, content)) goto end;
+    if (s_docx_run_finish(alloc, NULL /*state*/, content)) goto end;
+    if (s_docx_paragraph_finish(alloc, content)) goto end;
     e = 0;
     end:
     return e;
 }
 
 
-static int extract_docx_char_truncate_if(extract_astring_t* content, char c)
+static int s_docx_char_truncate_if(extract_astring_t* content, char c)
 /* Removes last char if it is <c>. */
 {
     if (content->chars_num && content->chars[content->chars_num-1] == c) {
@@ -118,22 +116,9 @@ static int extract_docx_char_truncate_if(extract_astring_t* content, char c)
 }
 
 
-typedef struct
-{
-    const char* font_name;
-    double      font_size;
-    int         font_bold;
-    int         font_italic;
-    matrix_t*   ctm_prev;
-} content_state_t;
-/* Used to keep track of font information when writing paragraphs of docx
-content, e.g. so we know whether a font has changed so need to start a new docx
-span. */
-
-
-static int extract_document_to_docx_content_paragraph(
+static int s_document_to_docx_content_paragraph(
         extract_alloc_t*    alloc,
-        content_state_t*    state,
+        content_state_t*    content_state,
         paragraph_t*        paragraph,
         extract_astring_t*  content
         )
@@ -142,7 +127,7 @@ font. */
 {
     int e = -1;
     int l;
-    if (extract_docx_paragraph_start(alloc, content)) goto end;
+    if (s_docx_paragraph_start(alloc, content)) goto end;
 
     for (l=0; l<paragraph->lines_num; ++l) {
         line_t* line = paragraph->lines[l];
@@ -151,45 +136,38 @@ font. */
             int si;
             span_t* span = line->spans[s];
             double font_size_new;
-            state->ctm_prev = &span->ctm;
+            content_state->ctm_prev = &span->ctm;
             font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm);
-            if (!state->font_name
-                    || strcmp(span->font_name, state->font_name)
-                    || span->flags.font_bold != state->font_bold
-                    || span->flags.font_italic != state->font_italic
-                    || font_size_new != state->font_size
+            if (!content_state->font.name
+                    || strcmp(span->font_name, content_state->font.name)
+                    || span->flags.font_bold != content_state->font.bold
+                    || span->flags.font_italic != content_state->font.italic
+                    || font_size_new != content_state->font.size
                     ) {
-                if (state->font_name) {
-                    if (extract_docx_run_finish(alloc, content)) goto end;
+                if (content_state->font.name) {
+                    if (s_docx_run_finish(alloc, content_state, content)) goto end;
                 }
-                state->font_name = span->font_name;
-                state->font_bold = span->flags.font_bold;
-                state->font_italic = span->flags.font_italic;
-                state->font_size = font_size_new;
-                if (extract_docx_run_start(
-                        alloc,
-                        content,
-                        state->font_name,
-                        state->font_size,
-                        state->font_bold,
-                        state->font_italic
-                        )) goto end;
+                content_state->font.name = span->font_name;
+                content_state->font.bold = span->flags.font_bold;
+                content_state->font.italic = span->flags.font_italic;
+                content_state->font.size = font_size_new;
+                if (s_docx_run_start(alloc, content, content_state)) goto end;
             }
 
             for (si=0; si<span->chars_num; ++si) {
                 char_t* char_ = &span->chars[si];
                 int c = char_->ucs;
-                if (extract_astring_cat_xmlc(alloc, content, c)) goto end;
+                if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end;
             }
             /* Remove any trailing '-' at end of line. */
-            if (extract_docx_char_truncate_if(content, '-')) goto end;
+            if (s_docx_char_truncate_if(content, '-')) goto end;
         }
     }
-    if (state->font_name) {
-        if (extract_docx_run_finish(alloc, content)) goto end;
-        state->font_name = NULL;
+    if (content_state->font.name)
+    {
+        if (s_docx_run_finish(alloc, content_state, content)) goto end;
     }
-    if (extract_docx_paragraph_finish(alloc, content)) goto end;
+    if (s_docx_paragraph_finish(alloc, content)) goto end;
     
     e = 0;
     
@@ -197,7 +175,7 @@ font. */
     return e;
 }
 
-static int extract_document_append_image(
+static int s_docx_append_image(
         extract_alloc_t*    alloc,
         extract_astring_t*  content,
         image_t*            image
@@ -265,7 +243,7 @@ static int extract_document_append_image(
 }
 
 
-static int extract_document_output_rotated_paragraphs(
+static int s_docx_output_rotated_paragraphs(
         extract_alloc_t*    alloc,
         extract_page_t*     page,
         int                 paragraph_begin,
@@ -353,7 +331,7 @@ static int extract_document_output_rotated_paragraphs(
     /* Output paragraphs p0..p2-1. */
     for (p=paragraph_begin; p<paragraph_end; ++p) {
         paragraph_t* paragraph = page->paragraphs[p];
-        if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
+        if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
     }
 
     extract_astring_cat(alloc, content, "\n");
@@ -387,7 +365,7 @@ static int extract_document_output_rotated_paragraphs(
 
     for (p=paragraph_begin; p<paragraph_end; ++p) {
         paragraph_t* paragraph = page->paragraphs[p];
-        if (extract_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
+        if (s_document_to_docx_content_paragraph(alloc, state, paragraph, content)) goto end;
     }
 
     extract_astring_cat(alloc, content, "\n");
@@ -406,6 +384,257 @@ static int extract_document_output_rotated_paragraphs(
 }
 
 
+static int s_docx_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content)
+/* Appends <table> to <content> as a docx <w:tbl> element. Returns 0 on success, -1 on failure.
+
+We do not fix the size of the table or its columns and rows, but instead leave layout up
+to the application. */
+{
+    int e = -1;
+    int y;
+    
+    if (extract_astring_cat(alloc, content,
+            "\n"
+            "    <w:tbl>\n"
+            "        <w:tblLayout w:type=\"autofit\"/>\n"
+            )) goto end;
+
+    for (y=0; y<table->cells_num_y; ++y)
+    {
+        int x;
+        if (extract_astring_cat(alloc, content,
+                "        <w:tr>\n"
+                "            <w:trPr/>\n"
+                )) goto end;
+        
+        for (x=0; x<table->cells_num_x; ++x)
+        {
+            cell_t* cell = table->cells[y*table->cells_num_x + x];  /* Cells are indexed row by row. */
+            if (!cell->left) continue;
+            
+            if (extract_astring_cat(alloc, content, "            <w:tc>\n")) goto end;
+            
+            /* Write cell properties: borders, then horizontal/vertical merge info. */
+            {
+                if (extract_astring_cat(alloc, content,
+                        "                <w:tcPr>\n"
+                        "                    <w:tcBorders>\n"
+                        "                        <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        "                        <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        "                        <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        "                        <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
+                        "                    </w:tcBorders>\n"
+                        )) goto end;
+                if (cell->extend_right > 1)
+                {
+                    if (extract_astring_catf(alloc, content, "                    <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right)) goto end;
+                }
+                if (cell->above)
+                {
+                    if (cell->extend_down > 1)
+                    {
+                        if (extract_astring_cat(alloc, content, "                    <w:vMerge w:val=\"restart\"/>\n")) goto end;
+                    }
+                }
+                else
+                {
+                    if (extract_astring_cat(alloc, content, "                    <w:vMerge w:val=\"continue\"/>\n")) goto end;
+                }
+                if (extract_astring_cat(alloc, content, "                </w:tcPr>\n")) goto end;
+            }
+            
+            /* Write contents of this cell. */
+            {
+                size_t chars_num_old = content->chars_num;
+                int p;
+                content_state_t content_state = {0};
+                content_state.font.name = NULL;
+                content_state.ctm_prev = NULL;
+                for (p=0; p<cell->paragraphs_num; ++p)
+                {
+                    paragraph_t* paragraph = cell->paragraphs[p];
+                    if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
+                }
+                if (content_state.font.name)
+                {
+                    if (s_docx_run_finish(alloc, &content_state, content)) goto end;
+                }
+
+                /* Need to write out at least an empty paragraph in each cell,
+                otherwise Word/Libreoffice fail to show table at all; the
+                OOXML spec says "If a table cell does not include at least one
+                block-level element, then this document shall be considered
+                corrupt." */
+                if (content->chars_num == chars_num_old)
+                {
+                    if (extract_astring_cat(alloc, content, "<w:p/>\n")) goto end;
+                }
+            }
+            if (extract_astring_cat(alloc, content, "            </w:tc>\n")) goto end;
+        }
+        if (extract_astring_cat(alloc, content, "        </w:tr>\n")) goto end;
+    }
+    if (extract_astring_cat(alloc, content, "    </w:tbl>\n")) goto end;
+    e = 0;
+    
+    end:
+    return e;
+}
+
+static int s_docx_append_rotated_paragraphs(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page,
+        content_state_t*    state,
+        int*                p,
+        int*                text_box_id,
+        const matrix_t*     ctm,
+        double              rotate,
+        extract_astring_t*  content
+        )
+/* Appends paragraphs with same rotation, starting with page->paragraphs[*p].
+On success sets *p to the index of the last paragraph appended. */
+{
+    /* Find extent of paragraphs with this same rotation. extent
+    will contain max width and max height of paragraphs, in units
+    before application of ctm, i.e. before rotation. */
+    int e = -1;
+    point_t extent = {0, 0};
+    int p0 = *p;
+    int p1;
+    paragraph_t* paragraph = page->paragraphs[*p];
+    
+    outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
+            rotate, rotate * 180 / pi,
+            ctm->e,
+            ctm->f,
+            ctm->a,
+            ctm->b,
+            ctm->c,
+            ctm->d
+            );
+
+    {
+        /* We assume that first span is at origin of text
+        block. This assumes left-to-right text. */
+        double rotate0 = rotate;
+        const matrix_t* ctm0 = ctm;
+        point_t origin = {
+                paragraph->lines[0]->spans[0]->chars[0].x,
+                paragraph->lines[0]->spans[0]->chars[0].y
+                };
+        matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
+        double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
+        if (ctm_det != 0) {
+            ctm_inverse.a = +ctm->d / ctm_det;
+            ctm_inverse.b = -ctm->b / ctm_det;
+            ctm_inverse.c = -ctm->c / ctm_det;
+            ctm_inverse.d = +ctm->a / ctm_det;
+        }
+        else {
+            outf("cannot invert ctm=(%f %f %f %f)",
+                    ctm->a, ctm->b, ctm->c, ctm->d);
+        }
+
+        for (*p=p0; *p<page->paragraphs_num; ++(*p)) {
+            paragraph = page->paragraphs[*p];
+            ctm = &paragraph->lines[0]->spans[0]->ctm;
+            rotate = atan2(ctm->b, ctm->a);
+            if (rotate != rotate0) {
+                break;
+            }
+
+            /* Update <extent>. */
+            {
+                int l;
+                for (l=0; l<paragraph->lines_num; ++l) {
+                    line_t* line = paragraph->lines[l];
+                    span_t* span = extract_line_span_last(line);
+                    char_t* char_ = extract_span_char_last(span);
+                    double adv = char_->adv * extract_matrix_expansion(span->trm);
+                    double x = char_->x + adv * cos(rotate);
+                    double y = char_->y + adv * sin(rotate);
+
+                    double dx = x - origin.x;
+                    double dy = y - origin.y;
+
+                    /* Position relative to origin and before box rotation. */
+                    double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
+                    double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
+                    yy = -yy;
+                    if (xx > extent.x) extent.x = xx;
+                    if (yy > extent.y) extent.y = yy;
+                    if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
+                            rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span));
+                }
+            }
+        }
+        p1 = *p;
+        rotate = rotate0;
+        ctm = ctm0;
+        outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
+                rotate, p0, p1, extent.x, extent.y);
+    }
+
+    /* Paragraphs p0..p1-1 have same rotation. We output them into
+    a single rotated text box. */
+
+    /* We need unique id for text box. */
+    *text_box_id += 1;
+
+    {
+        /* Angles are in units of 1/60,000 degree. */
+        int rot = (int) (rotate * 180 / pi * 60000);
+
+        /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm.
+        relativeHeight is z-ordering. (wp:positionH:wp:posOffset,
+        wp:positionV:wp:posOffset) is position of origin of box in
+        EMU.
+
+        The box rotates about its centre but we want to rotate
+        about the origin (top-left). So we correct the position of
+        box by subtracting the vector that the top-left moves when
+        rotated by angle <rotate> about the middle. */
+        double point_to_emu = 12700;    /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */
+        int x = (int) (ctm->e * point_to_emu);
+        int y = (int) (ctm->f * point_to_emu);
+        int w = (int) (extent.x * point_to_emu);
+        int h = (int) (extent.y * point_to_emu);
+        int dx;
+        int dy;
+
+        if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot);
+
+        h *= 2;
+        /* We can't predict how much space Word will actually
+        require for the rotated text, so make the box have the
+        original width but allow text to take extra vertical
+        space. There doesn't seem to be a way to make the text box
+        auto-grow to contain the text. */
+
+        dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0);
+        dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0);
+        outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik",
+                ctm->e,
+                ctm->f,
+                rotate * 180/pi,
+                x/1000,
+                y/1000,
+                dx/1000,
+                dy/1000
+                );
+        x -= dx;
+        y -= -dy;
+
+        if (s_docx_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, *text_box_id, content, state)) goto end;
+    }
+    *p = p1 - 1;
+    e = 0;
+    
+    end:
+    
+    return e;
+}
+
 int extract_document_to_docx_content(
         extract_alloc_t*    alloc,
         document_t*         document,
@@ -422,184 +651,73 @@ int extract_document_to_docx_content(
     /* Write paragraphs into <content>. */
     for (p=0; p<document->pages_num; ++p) {
         extract_page_t* page = document->pages[p];
-        int p;
-        content_state_t state;
-        state.font_name = NULL;
-        state.font_size = 0;
-        state.font_bold = 0;
-        state.font_italic = 0;
-        state.ctm_prev = NULL;
         
-        for (p=0; p<page->paragraphs_num; ++p) {
-            paragraph_t* paragraph = page->paragraphs[p];
-            const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
-            double rotate = atan2(ctm->b, ctm->a);
+        int p = 0;
+        int t = 0;
+        
+        content_state_t content_state;
+        content_state.font.name = NULL;
+        content_state.font.size = 0;
+        content_state.font.bold = 0;
+        content_state.font.italic = 0;
+        content_state.ctm_prev = NULL;
+        
+        /* Output paragraphs and tables in order of y coordinate. */
+        for(;;)
+        {
+            paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p];
+            table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+            double y_paragraph;
+            double y_table;
+            if (!paragraph && !table)   break;
+            y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+            y_table = (table) ? table->pos.y : DBL_MAX;
             
-            if (spacing
-                    && state.ctm_prev
-                    && paragraph->lines_num
-                    && paragraph->lines[0]->spans_num
-                    && matrix_cmp4(
-                            state.ctm_prev,
-                            &paragraph->lines[0]->spans[0]->ctm
-                            )
-                    ) {
-                /* Extra vertical space between paragraphs that were at
-                different angles in the original document. */
-                if (extract_docx_paragraph_empty(alloc, content)) goto end;
-            }
+            if (paragraph && y_paragraph < y_table)
+            {
+                const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
+                double rotate = atan2(ctm->b, ctm->a);
+
+                if (spacing
+                        && content_state.ctm_prev
+                        && paragraph->lines_num
+                        && paragraph->lines[0]->spans_num
+                        && extract_matrix_cmp4(
+                                content_state.ctm_prev,
+                                &paragraph->lines[0]->spans[0]->ctm
+                                )
+                        ) {
+                    /* Extra vertical space between paragraphs that were at
+                    different angles in the original document. */
+                    if (s_docx_paragraph_empty(alloc, content)) goto end;
+                }
 
-            if (spacing) {
-                /* Extra vertical space between paragraphs. */
-                if (extract_docx_paragraph_empty(alloc, content)) goto end;
-            }
-            
-            if (rotation && rotate != 0) {
-            
-                /* Find extent of paragraphs with this same rotation. extent
-                will contain max width and max height of paragraphs, in units
-                before application of ctm, i.e. before rotation. */
-                point_t extent = {0, 0};
-                int p0 = p;
-                int p1;
-                
-                outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
-                        rotate, rotate * 180 / pi,
-                        ctm->e,
-                        ctm->f,
-                        ctm->a,
-                        ctm->b,
-                        ctm->c,
-                        ctm->d
-                        );
-                
-                {
-                    /* We assume that first span is at origin of text
-                    block. This assumes left-to-right text. */
-                    double rotate0 = rotate;
-                    const matrix_t* ctm0 = ctm;
-                    point_t origin = {
-                            paragraph->lines[0]->spans[0]->chars[0].x,
-                            paragraph->lines[0]->spans[0]->chars[0].y
-                            };
-                    matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
-                    double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
-                    if (ctm_det != 0) {
-                        ctm_inverse.a = +ctm->d / ctm_det;
-                        ctm_inverse.b = -ctm->b / ctm_det;
-                        ctm_inverse.c = -ctm->c / ctm_det;
-                        ctm_inverse.d = +ctm->a / ctm_det;
-                    }
-                    else {
-                        outf("cannot invert ctm=(%f %f %f %f)",
-                                ctm->a, ctm->b, ctm->c, ctm->d);
-                    }
+                if (spacing) {
+                    /* Extra vertical space between paragraphs. */
+                    if (s_docx_paragraph_empty(alloc, content)) goto end;
+                }
 
-                    for (p=p0; p<page->paragraphs_num; ++p) {
-                        paragraph = page->paragraphs[p];
-                        ctm = &paragraph->lines[0]->spans[0]->ctm;
-                        rotate = atan2(ctm->b, ctm->a);
-                        if (rotate != rotate0) {
-                            break;
-                        }
-
-                        /* Update <extent>. */
-                        {
-                            int l;
-                            for (l=0; l<paragraph->lines_num; ++l) {
-                                line_t* line = paragraph->lines[l];
-                                span_t* span = line_span_last(line);
-                                char_t* char_ = span_char_last(span);
-                                double adv = char_->adv * matrix_expansion(span->trm);
-                                double x = char_->x + adv * cos(rotate);
-                                double y = char_->y + adv * sin(rotate);
-
-                                double dx = x - origin.x;
-                                double dy = y - origin.y;
-
-                                /* Position relative to origin and before box rotation. */
-                                double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
-                                double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
-                                yy = -yy;
-                                if (xx > extent.x) extent.x = xx;
-                                if (yy > extent.y) extent.y = yy;
-                                if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
-                                        rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span));
-                            }
-                        }
-                    }
-                    p1 = p;
-                    rotate = rotate0;
-                    ctm = ctm0;
-                    outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
-                            rotate, p0, p1, extent.x, extent.y);
+                if (rotation && rotate != 0)
+                {
+                    if (s_docx_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content)) goto end;
                 }
-                
-                /* Paragraphs p0..p1-1 have same rotation. We output them into
-                a single rotated text box. */
-                
-                /* We need unique id for text box. */
-                text_box_id += 1;
-                
+                else
                 {
-                    /* Angles are in units of 1/60,000 degree. */
-                    int rot = (int) (rotate * 180 / pi * 60000);
-
-                    /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm.
-                    relativeHeight is z-ordering. (wp:positionV:wp:posOffset,
-                    wp:positionV:wp:posOffset) is position of origin of box in
-                    EMU.
-
-                    The box rotates about its centre but we want to rotate
-                    about the origin (top-left). So we correct the position of
-                    box by subtracting the vector that the top-left moves when
-                    rotated by angle <rotate> about the middle. */
-                    double point_to_emu = 12700;    /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */
-                    int x = (int) (ctm->e * point_to_emu);
-                    int y = (int) (ctm->f * point_to_emu);
-                    int w = (int) (extent.x * point_to_emu);
-                    int h = (int) (extent.y * point_to_emu);
-                    int dx;
-                    int dy;
-
-                    if (0) outf("rotate: %f rad, %f deg. rot=%i", rotate, rotate*180/pi, rot);
-
-                    h *= 2;
-                    /* We can't predict how much space Word will actually
-                    require for the rotated text, so make the box have the
-                    original width but allow text to take extra vertical
-                    space. There doesn't seem to be a way to make the text box
-                    auto-grow to contain the text. */
-
-                    dx = (int) ((1-cos(rotate)) * w / 2.0 + sin(rotate) * h / 2.0);
-                    dy = (int) ((cos(rotate)-1) * h / 2.0 + sin(rotate) * w / 2.0);
-                    outf("ctm->e,f=%f,%f rotate=%f => x,y=%ik %ik dx,dy=%ik %ik",
-                            ctm->e,
-                            ctm->f,
-                            rotate * 180/pi,
-                            x/1000,
-                            y/1000,
-                            dx/1000,
-                            dy/1000
-                            );
-                    x -= dx;
-                    y -= -dy;
-
-                    if (extract_document_output_rotated_paragraphs(alloc, page, p0, p1, rot, x, y, w, h, text_box_id, content, &state)) goto end;
+                    if (s_document_to_docx_content_paragraph(alloc, &content_state, paragraph, content)) goto end;
                 }
-                p = p1 - 1;
-                //p = page->paragraphs_num - 1;
+                p += 1;
             }
-            else {
-                if (extract_document_to_docx_content_paragraph(alloc, &state, paragraph, content)) goto end;
+            else if (table)
+            {
+                if (s_docx_append_table(alloc, table, content)) goto end;
+                t += 1;
             }
-        
         }
         
         if (images) {
             int i;
             for (i=0; i<page->images_num; ++i) {
-                extract_document_append_image(alloc, content, &page->images[i]);
+                s_docx_append_image(alloc, content, &page->images[i]);
             }
         }
     }
@@ -738,7 +856,6 @@ int extract_docx_write_template(
     int     e = -1;
     int     i;
     char*   path_tempdir = NULL;
-    FILE*   f = NULL;
     char*   path = NULL;
     char*   text = NULL;
     char*   text2 = NULL;
@@ -841,7 +958,6 @@ int extract_docx_write_template(
     extract_free(alloc, &path);
     extract_free(alloc, &text);
     extract_free(alloc, &text2);
-    if (f)  fclose(f);
 
     if (e) {
         outf("Failed to create %s", path_out);
diff --git a/extract/src/docx.h b/extract/src/docx.h
index 6e26568f..976272a6 100644
--- a/extract/src/docx.h
+++ b/extract/src/docx.h
@@ -13,8 +13,8 @@ int extract_document_to_docx_content(
         int                 images,
         extract_astring_t*  content
         );
-/* Makes *o_content point to a string containing all paragraphs in *document in
-docx XML format.
+/* Appends to *content a string containing all paragraphs, images and
+tables (tables as of 2021-07-22) in *document in docx XML format.
 
 This string can be passed to extract_docx_content_item() or
 extract_docx_write_template() to be inserted into a docx archive's
diff --git a/extract/src/docx_template_build.py b/extract/src/docx_template_build.py
index 5e2f5380..8b836300 100755
--- a/extract/src/docx_template_build.py
+++ b/extract/src/docx_template_build.py
@@ -9,6 +9,9 @@ Args:
     --pretty <directory>
         Prettyfies all .xml files within <directory> using 'xmllint --format'.
 
+    -f
+        Force touch of output file, even if unchanged.
+
     -i <in-path>
         Set template docx/odt file to extract from.
     
@@ -57,12 +60,17 @@ def write(text, path, encoding):
     with open(path, 'wb') as f:
         f.write(text.encode(encoding))
 
-def write_if_diff(text, path, encoding):
-    if os.path.isfile(path):
-        old = read(path, encoding)
-        if old == text:
-            return
-    print(f'Updating path={path} because contents have changed')
+def write_if_diff(text, path, encoding, force):
+    '''
+    Does nothing if <force> is false and file named <path> already contains
+    <text>. Otherwise writes <text> to file named <path>.
+    '''
+    if not force:
+        if os.path.isfile(path):
+            old = read(path, encoding)
+            if old == text:
+                return
+        print(f'Updating path={path} because contents have changed')
     write(text, path, encoding)
 
 def check_path_safe(path):
@@ -98,6 +106,8 @@ def main():
     path_in = None
     path_out = None
     infix = None
+    force = False
+
     args = iter(sys.argv[1:])
     while 1:
         try: arg = next(args)
@@ -114,6 +124,8 @@ def main():
                     path = os.path.join(dirpath, filename)
                     system(f'xmllint --format {path} > {path}-')
                     system(f'mv {path}- {path}')
+        elif arg == '-f':
+            force = True
         elif arg == '-i':
             path_in = next(args)
         elif arg == '-n':
@@ -166,7 +178,7 @@ def main():
         for filename in sorted(filenames):
             num_items += 1
             path = os.path.join(dirpath, filename)
-            print(f'looking at path={path}')
+            #print(f'looking at path={path}')
             name = path[ len(path_temp)+1: ]
             out_c.write(f'    {{\n')
             out_c.write(f'        "{name}",\n')
@@ -213,7 +225,7 @@ def main():
     out_c.write(f'int {infix}_template_items_num = {num_items};\n')
     
     out_c = out_c.getvalue()
-    write_if_diff(out_c, f'{path_out}.c', 'utf-8')
+    write_if_diff(out_c, f'{path_out}.c', 'utf-8', force)
     
     out_h = io.StringIO()
     out_h.write(f'#ifndef EXTRACT_{infix.upper()}_TEMPLATE_H\n')
@@ -233,7 +245,7 @@ def main():
     out_h.write(f'\n')
     out_h.write(f'\n')
     out_h.write(f'#endif\n')
-    write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8')
+    write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8', force)
     #os.system(f'rm -r "{path_temp}"')
     
 if __name__ == '__main__':
diff --git a/extract/src/extract-exe.c b/extract/src/extract-exe.c
index 22b520db..ee34023a 100644
--- a/extract/src/extract-exe.c
+++ b/extract/src/extract-exe.c
@@ -139,6 +139,7 @@ int main(int argc, char** argv)
             if (arg_next_string(argv, argc, &i, &format_name)) goto end;
             if (!strcmp(format_name, "odt")) format = extract_format_ODT;
             else if (!strcmp(format_name, "docx")) format = extract_format_DOCX;
+            else if (!strcmp(format_name, "html")) format = extract_format_HTML;
             else
             {
                 printf("-f value should be 'odt' or 'docx', not '%s'.\n", format_name);
@@ -170,7 +171,7 @@ int main(int argc, char** argv)
         else if (!strcmp(arg, "-v")) {
             int verbose;
             if (arg_next_int(argv, argc, &i, &verbose)) goto end;
-            outf_verbose_set(verbose);
+            extract_outf_verbose_set(verbose);
             outf("Have changed verbose to %i", verbose);
         }
         else if (!strcmp(arg, "--v-alloc")) {
diff --git a/extract/src/extract.c b/extract/src/extract.c
index 9eb85d2f..2c375571 100644
--- a/extract/src/extract.c
+++ b/extract/src/extract.c
@@ -5,6 +5,7 @@
 #include "document.h"
 #include "docx.h"
 #include "docx_template.h"
+#include "html.h"
 #include "mem.h"
 #include "memento.h"
 #include "odt.h"
@@ -25,7 +26,7 @@
 
 
 
-double matrix_expansion(matrix_t m)
+double extract_matrix_expansion(matrix_t m)
 {
     return sqrt(fabs(m.a * m.d - m.b * m.c));
 }
@@ -41,14 +42,31 @@ static void char_init(char_t* item)
     item->adv = 0;
 }
 
+const char* extract_point_string(const point_t* point)
+{
+    static char buffer[128];    /* Static, so the returned string is overwritten by the next call. */
+    snprintf(buffer, sizeof(buffer), "(%f %f)", point->x, point->y);
+    return buffer;
+}
+
+const char* extract_rect_string(const rect_t* rect)
+{
+    static char buffer[2][256]; /* Two static buffers used alternately, so the two most recent results stay valid. */
+    static int i = 0;
+    i = (i + 1) % 2;    /* Switch to the buffer not used by the previous call. */
+    snprintf(buffer[i], sizeof(buffer[i]), "((%f %f) (%f %f))", rect->min.x, rect->min.y, rect->max.x, rect->max.y);
+    return buffer[i];
+}
 
-const char* span_string(extract_alloc_t* alloc, span_t* span)
+const char* extract_span_string(extract_alloc_t* alloc, span_t* span)
 {
     static extract_astring_t ret = {0};
     double x0 = 0;
     double y0 = 0;
+    point_t pre0 = {0, 0};
     double x1 = 0;
     double y1 = 0;
+    point_t pre1 = {0, 0};
     int c0 = 0;
     int c1 = 0;
     int i;
@@ -62,17 +80,23 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
         c0 = span->chars[0].ucs;
         x0 = span->chars[0].x;
         y0 = span->chars[0].y;
+        pre0.x = span->chars[0].pre_x;
+        pre0.y = span->chars[0].pre_y;
         c1 = span->chars[span->chars_num-1].ucs;
         x1 = span->chars[span->chars_num-1].x;
         y1 = span->chars[span->chars_num-1].y;
+        pre1.x = span->chars[span->chars_num-1].pre_x;
+        pre1.y = span->chars[span->chars_num-1].pre_y;
     }
     {
-        char buffer[200];
+        char buffer[400];
         snprintf(buffer, sizeof(buffer),
-                "span chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f,%f) wmode=%i chars_num=%i: ",
+                "span ctm=%s trm=%s chars_num=%i (%c:%f,%f pre(%f %f))..(%c:%f,%f pre(%f %f)) font=%s:(%f,%f) wmode=%i chars_num=%i: ",
+                extract_matrix_string(&span->ctm),
+                extract_matrix_string(&span->trm),
                 span->chars_num,
-                c0, x0, y0,
-                c1, x1, y1,
+                c0, x0, y0, pre0.x, pre0.y,
+                c1, x1, y1, pre1.x, pre1.y,
                 span->font_name,
                 span->trm.a,
                 span->trm.d,
@@ -84,9 +108,11 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
             snprintf(
                     buffer,
                     sizeof(buffer),
-                    " i=%i {x=%f adv=%f}",
+                    " i=%i {x=%f y=%f ucs=%i adv=%f}",
                     i,
                     span->chars[i].x,
+                    span->chars[i].y,
+                    span->chars[i].ucs,
                     span->chars[i].adv
                     );
             extract_astring_cat(alloc, &ret, buffer);
@@ -101,7 +127,7 @@ const char* span_string(extract_alloc_t* alloc, span_t* span)
     return ret.chars;
 }
 
-int span_append_c(extract_alloc_t* alloc, span_t* span, int c)
+int extract_span_append_c(extract_alloc_t* alloc, span_t* span, int c)
 {
     char_t* item;
     if (extract_realloc2(
@@ -119,7 +145,7 @@ int span_append_c(extract_alloc_t* alloc, span_t* span, int c)
     return 0;
 }
 
-char_t* span_char_last(span_t* span)
+char_t* extract_span_char_last(span_t* span)
 {
     assert(span->chars_num > 0);
     return &span->chars[span->chars_num-1];
@@ -138,58 +164,62 @@ static const char* line_string(line_t* line)
     int i;
     for (i=0; i<line->spans_num; ++i) {
         extract_astring_cat(&ret, " ");
-        extract_astring_cat(&ret, span_string(line->spans[i]));
+        extract_astring_cat(&ret, extract_span_string(line->spans[i]));
     }
     return ret.chars;
 }
 #endif
 
-/* Returns first span in a line. */
+/* Returns last span in a line. */
-span_t* line_span_last(line_t* line)
+span_t* extract_line_span_last(line_t* line)
 {
     assert(line->spans_num > 0);
     return line->spans[line->spans_num - 1];
 }
 
-span_t* line_span_first(line_t* line)
+span_t* extract_line_span_first(line_t* line)
 {
     assert(line->spans_num > 0);
     return line->spans[0];
 }
 
-static void page_free(extract_alloc_t* alloc, extract_page_t* page)
+
+static void table_free(extract_alloc_t* alloc, table_t** ptable)
+{
+    int c;
+    table_t* table = *ptable;   /* NOTE(review): dereferenced below without a NULL check, unlike page_free() - confirm callers never pass NULL. */
+    outf("table->cells_num_x=%i table->cells_num_y=%i",
+            table->cells_num_x,
+            table->cells_num_y
+            );
+    for (c = 0;  c< table->cells_num_x * table->cells_num_y; ++c)   /* Visits cells_num_x * cells_num_y entries of table->cells[]. */
+    {
+        extract_cell_free(alloc, &table->cells[c]);
+    }
+    extract_free(alloc, &table->cells); /* Free the cells array, then the table itself via the caller's pointer. */
+    extract_free(alloc, ptable);
+}
+
+static void page_free(extract_alloc_t* alloc, extract_page_t** ppage)
 {
-    int s;
+    extract_page_t* page = *ppage;
     if (!page) return;
 
-    for (s=0; s<page->spans_num; ++s) {
-        span_t* span = page->spans[s];
-        if (span) {
-            extract_free(alloc, &span->chars);
-            extract_free(alloc, &span->font_name);
-        }
-        extract_free(alloc, &span);
-    }
-    extract_free(alloc, &page->spans);
+    outf0("page=%p page->spans_num=%i page->lines_num=%i",
+            page, page->spans_num, page->lines_num);
+    extract_spans_free(alloc, &page->spans, page->spans_num);
 
-    {
-        int l;
-        for (l=0; l<page->lines_num; ++l) {
-            line_t* line = page->lines[l];
-            extract_free(alloc, &line->spans);
-            extract_free(alloc, &line);
-            /* We don't free line->spans->chars[] because already freed via
-            page->spans. */
-        }
-    }
-    extract_free(alloc, &page->lines);
+    extract_lines_free(alloc, &page->lines, page->lines_num);
 
     {
         int p;
         for (p=0; p<page->paragraphs_num; ++p) {
             paragraph_t* paragraph = page->paragraphs[p];
+            /* We don't call extract_lines_free(&paragraph->lines) because
+            these point into the same data as page->lines, which we have
+            already freed above. */
             if (paragraph) extract_free(alloc, &paragraph->lines);
-            extract_free(alloc, &paragraph);
+            extract_free(alloc, &page->paragraphs[p]);
         }
     }
     extract_free(alloc, &page->paragraphs);
@@ -197,13 +227,26 @@ static void page_free(extract_alloc_t* alloc, extract_page_t* page)
     {
         int i;
         for (i=0; i<page->images_num; ++i) {
-            extract_free(alloc, &page->images[i].data);
-            extract_free(alloc, &page->images[i].type);
-            extract_free(alloc, &page->images[i].id);
-            extract_free(alloc, &page->images[i].name);
+            extract_image_clear(alloc, &page->images[i]);
         }
+        extract_free(alloc, &page->images);
     }
     extract_free(alloc, &page->images);
+
+    extract_free(alloc, &page->tablelines_horizontal.tablelines);
+    extract_free(alloc, &page->tablelines_vertical.tablelines);
+    
+    {
+        int t;
+        outf("page=%p page->tables_num=%i", page, page->tables_num);
+        for (t=0; t<page->tables_num; ++t)
+        {
+            table_free(alloc, &page->tables[t]);
+        }
+        extract_free(alloc, &page->tables);
+    }
+    
+    extract_free(alloc, ppage);
 }
 
 static span_t* page_span_append(extract_alloc_t* alloc, extract_page_t* page)
@@ -212,9 +255,7 @@ error. */
 {
     span_t* span;
     if (extract_malloc(alloc, &span, sizeof(*span))) return NULL;
-    span->font_name = NULL;
-    span->chars = NULL;
-    span->chars_num = 0;
+    extract_span_init(span);
     if (extract_realloc2(
             alloc,
             &page->spans,
@@ -234,14 +275,7 @@ static void extract_images_free(extract_alloc_t* alloc, images_t* images)
 {
     int i;
     for (i=0; i<images->images_num; ++i) {
-        image_t*    image = &images->images[i];
-        extract_free(alloc, &image->type);
-        extract_free(alloc, &image->name);
-        extract_free(alloc, &image->id);
-        if (image->data_free) {
-            image->data_free(image->data_free_handle, image->data);
-        }
-        extract_free(alloc, &images->images[i]);
+        extract_image_clear(alloc, &images->images[i]);
     }
     extract_free(alloc, &images->images);
     extract_free(alloc, &images->imagetypes);
@@ -260,10 +294,12 @@ On return document->page[].images* will be NULL etc.
     int p;
     images_t   images = {0};
     outf("extract_document_images(): images.images_num=%i", images.images_num);
-    for (p=0; p<document->pages_num; ++p) {
+    for (p=0; p<document->pages_num; ++p)
+    {
         extract_page_t* page = document->pages[p];
         int i;
-        for (i=0; i<page->images_num; ++i) {
+        for (i=0; i<page->images_num; ++i)
+        {
             image_t* image;
             if (extract_realloc2(
                     alloc,
@@ -280,14 +316,17 @@ On return document->page[].images* will be NULL etc.
             /* Add image type if we haven't seen it before. */
             {
                 int it;
-                for (it=0; it<images.imagetypes_num; ++it) {
+                for (it=0; it<images.imagetypes_num; ++it)
+                {
                     outf("it=%i images.imagetypes[it]=%s image->type=%s",
                             it, images.imagetypes[it], image->type);
                     if (!strcmp(images.imagetypes[it], image->type)) {
                         break;
                     }
                 }
-                if (it == images.imagetypes_num) {
+                if (it == images.imagetypes_num)
+                {
+                    /* We haven't seen this image type before. */
                     if (extract_realloc2(
                             alloc,
                             &images.imagetypes,
@@ -314,9 +353,12 @@ On return document->page[].images* will be NULL etc.
     }
     e = 0;
     end:
-    if (e) {
+    if (e)
+    {
+        extract_free(alloc, &images.images);
     }
-    else {
+    else
+    {
         *o_images = images;
     }
     return e;
@@ -330,8 +372,7 @@ static void extract_document_free(extract_alloc_t* alloc, document_t* document)
     }
     for (p=0; p<document->pages_num; ++p) {
         extract_page_t* page = document->pages[p];
-        page_free(alloc, page);
-        extract_free(alloc, &page);
+        page_free(alloc, &page);
     }
     extract_free(alloc, &document->pages);
     document->pages = NULL;
@@ -347,7 +388,7 @@ static int s_sign(double x)
     return 0;
 }
 
-int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
+int extract_matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
 {
     int ret;
     ret = s_sign(lhs->a - rhs->a);  if (ret) return ret;
@@ -358,7 +399,7 @@ int matrix_cmp4(const matrix_t* lhs, const matrix_t* rhs)
 }
 
 
-static point_t multiply_matrix_point(matrix_t m, point_t p)
+point_t extract_multiply_matrix_point(matrix_t m, point_t p)
 {
     double x = p.x;
     p.x = m.a * x + m.c * p.y;
@@ -366,6 +407,18 @@ static point_t multiply_matrix_point(matrix_t m, point_t p)
     return p;
 }
 
+matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2)
+{   /* Returns a matrix combining <m1> and <m2>; a,b,c,d are multiplied as a 2x2 matrix. */
+    matrix_t ret;
+    ret.a = m1.a * m2.a + m1.b * m2.c;  /* Row (a b) of m1 times column (a c) of m2. */
+    ret.b = m1.a * m2.b + m1.b * m2.d;  /* Row (a b) of m1 times column (b d) of m2. */
+    ret.c = m1.c * m2.a + m1.d * m2.c;  /* Row (c d) of m1 times column (a c) of m2. */
+    ret.d = m1.c * m2.b + m1.d * m2.d;  /* Row (c d) of m1 times column (b d) of m2. */
+    ret.e = m1.e + m2.e;    /* NOTE(review): the two offsets are simply summed; m1's offset */
+    ret.f = m1.f + m2.f;    /* is not scaled by m2's a..d - confirm callers expect this. */
+    return ret;
+}
+
 static int s_matrix_read(const char* text, matrix_t* matrix)
 {
     int n;
@@ -427,8 +480,8 @@ char_t into a new span_t. */
         return 0;
     }
 
-    font_size = matrix_expansion(span->trm)
-            * matrix_expansion(span->ctm);
+    font_size = extract_matrix_expansion(span->trm)
+            * extract_matrix_expansion(span->ctm);
 
     if (span->flags.wmode) {
         dir.x = 0;
@@ -438,7 +491,7 @@ char_t into a new span_t. */
         dir.x = 1;
         dir.y = 0;
     }
-    dir = multiply_matrix_point(span->trm, dir);
+    dir = extract_multiply_matrix_point(span->trm, dir);
 
     x = char_[-2].pre_x + char_[-2].adv * dir.x;
     y = char_[-2].pre_y + char_[-2].adv * dir.y;
@@ -470,10 +523,10 @@ char_t into a new span_t. */
             sometimes seem to appear in the middle of words for some
             reason. */
             outfx("removing space before final char in: %s",
-                    span_string(span));
+                    extract_span_string(span));
             span->chars[span->chars_num-2] = span->chars[span->chars_num-1];
             span->chars_num -= 1;
-            outfx("span is now:                         %s", span_string(span));
+            outfx("span is now:                         %s", extract_span_string(span));
             return 0;
         }
     }
@@ -536,9 +589,42 @@ struct extract_t
     int                 contentss_num;
     
     images_t            images;
-
+    
     extract_format_t    format;
     extract_odt_styles_t odt_styles;
+    
+    char*               tables_csv_format;
+    int                 tables_csv_i;
+    
+    enum
+    {
+        path_type_NONE,
+        path_type_FILL,
+        path_type_STROKE,
+    } path_type;
+    
+    union
+    {
+        struct
+        {
+            matrix_t    ctm;
+            double      color;
+            point_t     points[4];
+            int         n;
+        } fill;
+        
+        struct
+        {
+            matrix_t    ctm;
+            double      color;
+            double      width;
+            point_t     point0;
+            int         point0_set;
+            point_t     point;
+            int         point_set;
+        } stroke;
+    
+    } path;
 };
 
 
@@ -551,7 +637,12 @@ int extract_begin(
     int e = -1;
     extract_t*  extract;
     
-    if (format != extract_format_ODT && format != extract_format_DOCX)
+    if (1
+            && format != extract_format_ODT
+            && format != extract_format_DOCX
+            && format != extract_format_HTML
+            && format != extract_format_TEXT
+            )
     {
         outf0("Invalid format=%i\n", format);
         errno = EINVAL;
@@ -570,6 +661,8 @@ int extract_begin(
     extract->image_n = 10;
     
     extract->format = format;
+    extract->tables_csv_format = NULL;
+    extract->tables_csv_i = 0;
     
     e = 0;
     
@@ -578,6 +671,11 @@ int extract_begin(
     return e;
 }
 
+int extract_tables_csv_format(extract_t* extract, const char* path_format)
+{   /* Stores a copy of <path_format>, freeing any earlier copy; returns extract_strdup()'s result. */
+    extract_free(extract->alloc, &extract->tables_csv_format);
+    return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format);
+}
 
 static void image_free_fn(void* handle, void* image_data)
 {
@@ -872,6 +970,22 @@ int extract_span_begin(
     span_t* span;
     assert(extract->document.pages_num > 0);
     page = extract->document.pages[extract->document.pages_num-1];
+    outf("extract_span_begin(): ctm=(%f %f %f %f %f %f) trm=(%f %f %f %f %f %f) font_name=%s, wmode=%i",
+            ctm_a,
+            ctm_b,
+            ctm_c,
+            ctm_d,
+            ctm_e,
+            ctm_f,
+            trm_a,
+            trm_b,
+            trm_c,
+            trm_d,
+            trm_e,
+            trm_f,
+            font_name,
+            wmode
+            );
     span = page_span_append(extract->alloc, page);
     if (!span) goto end;
     span->ctm.a = ctm_a;
@@ -880,12 +994,14 @@ int extract_span_begin(
     span->ctm.d = ctm_d;
     span->ctm.e = ctm_e;
     span->ctm.f = ctm_f;
+    
     span->trm.a = trm_a;
     span->trm.b = trm_b;
     span->trm.c = trm_c;
     span->trm.d = trm_d;
     span->trm.e = trm_e;
     span->trm.f = trm_f;
+    
     {
         const char* ff = strchr(font_name, '+');
         const char* f = (ff) ? ff+1 : font_name;
@@ -916,7 +1032,49 @@ int extract_add_char(
     extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
     span_t* span = page->spans[page->spans_num - 1];
     
-    if (autosplit && y - extract->span_offset_y != 0) {
+    outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv);
+    /* Ignore the specified <autosplit> - there seems no advantage to not
+    splitting spans on multiple lines, and not doing so causes problems with
+    missing spaces in the output. */
+    autosplit = 1;
+    
+    if (span->chars_num)
+    {
+        char_t* char_prev = &span->chars[span->chars_num - 1];
+        double xx = span->ctm.a * x + span->ctm.c * y + span->ctm.e;
+        double yy = span->ctm.b * x + span->ctm.d * y + span->ctm.f;
+        double dx = xx - char_prev->x;
+        double dy = yy - char_prev->y;
+        double a = atan2(dy, dx);
+        double span_a;
+        matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm);
+        point_t dir = {1 - span->flags.wmode, span->flags.wmode};
+        dir = extract_multiply_matrix_point(m, dir);
+        span_a = atan2(dir.y, dir.x);
+        if (fabs(span_a - a) > 0.01)
+        {
+            /* Create new span. */
+            span_t* span0 = span;
+            outf("chars_num=%i prev=(%f %f) => (%f %f) xy=(%f %f) => xxyy=(%f %f) delta=(%f %f) a=%f not in line with dir=(%f %f) a=%f: ",
+                    span->chars_num,
+                    char_prev->pre_x, char_prev->pre_y,
+                    char_prev->x, char_prev->y,
+                    x, y,
+                    xx, yy,
+                    dx, dy, a,
+                    dir.x, dir.y, span_a
+                    );
+            extract->num_spans_autosplit += 1;
+            span = page_span_append(extract->alloc, page);
+            if (!span) goto end;
+            *span = *span0;
+            span->chars = NULL;
+            span->chars_num = 0;
+            if (extract_strdup(extract->alloc, span0->font_name, &span->font_name)) goto end;
+        }
+    }
+    
+    if (0 && autosplit && y - extract->span_offset_y != 0) {
         
         double e = span->ctm.e + span->ctm.a * (x - extract->span_offset_x)
                 + span->ctm.b * (y - extract->span_offset_y);
@@ -949,21 +1107,20 @@ int extract_add_char(
                 char_pre_y, offset_y);
     }
     
-    if (span_append_c(extract->alloc, span, 0 /*c*/)) goto end;
+    if (extract_span_append_c(extract->alloc, span, 0 /*c*/)) goto end;
+    /* Coverity warns, but extract_span_append_c() will have appended an item. */
+    /* coverity[var_deref_op] */
     char_ = &span->chars[ span->chars_num-1];
     
-    char_->pre_x = x - extract->span_offset_x;
-    char_->pre_y = y - extract->span_offset_y;
+    char_->pre_x = x;
+    char_->pre_y = y;
 
-    char_->x = span->ctm.a * char_->pre_x + span->ctm.b * char_->pre_y;
-    char_->y = span->ctm.c * char_->pre_x + span->ctm.d * char_->pre_y;
+    char_->x = span->ctm.a * char_->pre_x + span->ctm.c * char_->pre_y + span->ctm.e;
+    char_->y = span->ctm.b * char_->pre_x + span->ctm.d * char_->pre_y + span->ctm.f;
     
     char_->adv = adv;
     char_->ucs = ucs;
 
-    char_->x += span->ctm.e;
-    char_->y += span->ctm.f;
-
     {
         int page_spans_num_old = page->spans_num;
         if (page_span_end_clean(extract->alloc, page)) goto end;
@@ -1049,6 +1206,174 @@ int extract_add_image(
     return e;
 }
 
+
+static int tablelines_append(extract_alloc_t* alloc, tablelines_t* tablelines, rect_t* rect, double color)
+{   /* Appends a copy of <rect> with <color> to <tablelines>. Returns 0, or -1 if growing the array fails. */
+    if (extract_realloc(
+            alloc,
+            &tablelines->tablelines,
+            sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1)
+            )) return -1;
+    tablelines->tablelines[ tablelines->tablelines_num].rect = *rect;
+    tablelines->tablelines[ tablelines->tablelines_num].color = (float) color;  /* Stored narrowed to float. */
+    tablelines->tablelines_num += 1;    /* Only bumped once the new slot has been filled in. */
+    return 0;
+}
+
+/* Returns (x, y) transformed by the matrix (ctm_a .. ctm_f). This uses the same
+convention as extract_add_char(): a and c contribute to x, b and d contribute
+to y, and (e, f) is the translation. */
+static point_t transform(double x, double y,
+        double ctm_a, double ctm_b, double ctm_c,
+        double ctm_d, double ctm_e, double ctm_f
+        )
+{
+    point_t ret;
+    /* b and c must not be swapped, otherwise rotated or sheared matrices go wrong. */
+    ret.x = ctm_a * x + ctm_c * y + ctm_e;
+    ret.y = ctm_b * x + ctm_d * y + ctm_f;
+    return ret;
+}
+
+static double s_min(double a, double b)
+{   /* Smaller of <a> and <b>; yields <b> whenever (a < b) is false. */
+    if (a < b) return a; else return b;
+}
+
+static double s_max(double a, double b)
+{   /* Larger of <a> and <b>; yields <b> whenever (a > b) is false. */
+    if (a > b) return a; else return b;
+}
+
+int extract_add_path4(
+        extract_t*  extract,
+        double ctm_a,   /* ctm_a .. ctm_f: matrix applied to each point via transform(). */
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double x0,      /* (x0,y0) .. (x3,y3): the four points of the path, in order. */
+        double y0,
+        double x1,
+        double y1,
+        double x2,
+        double y2,
+        double x3,
+        double y3,
+        double color    /* Stored with any table line that gets recorded. */
+        )
+{   /* Records a thin axis-aligned rectangle as a table line on the last page. Returns 0, or -1 if appending fails. */
+    extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    point_t points[4] = {
+            transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+            transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+            transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
+            transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f)
+            };
+    rect_t rect;
+    int i;
+    double dx;
+    double dy;
+    if (0 && color == 1)    /* Disabled by the leading 0: would skip paths whose color is 1. */
+    {
+        return 0;
+    }
+    outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]",
+            ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f,
+            x0, y0, x1, y1, x2, y2, x3, y3
+            );
+    outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]",
+            x0, y0, x1, y1, x2, y2, x3, y3);
+    /* Find first step with dx > 0. */
+    for (i=0; i<4; ++i)
+    {
+        if (points[(i+1) % 4].x > points[(i+0) % 4].x)    break;
+    }
+    outf("i=%i", i);
+    if (i == 4) return 0;   /* No edge moves in +x, so this cannot be a rectangle; ignore it. */
+    rect.min.x = points[(i+0) % 4].x;
+    rect.max.x = points[(i+1) % 4].x;
+    if (points[(i+2) % 4].x != rect.max.x)  return 0;  /* Next edge must be exactly vertical. */
+    if (points[(i+3) % 4].x != rect.min.x)  return 0;  /* Far edge must end back at min.x. */
+    y0 = points[(i+1) % 4].y;   /* From here y0/y1 are reused for transformed edge heights. */
+    y1 = points[(i+2) % 4].y;
+    if (y0 == y1)   return 0;   /* Zero height: not a rectangle. */
+    if (points[(i+3) % 4].y != y1)  return 0;  /* Edge i+2 -> i+3 must be exactly horizontal. */
+    if (points[(i+4) % 4].y != y0)  return 0;  /* Edge i -> i+1 must be exactly horizontal. */
+    rect.min.y = (y1 > y0) ? y0 : y1;
+    rect.max.y = (y1 > y0) ? y1 : y0;
+    
+    dx = rect.max.x - rect.min.x;   /* Greater than zero because of the search loop above. */
+    dy = rect.max.y - rect.min.y;   /* Greater than zero because y0 != y1. */
+    if (dx / dy > 5)
+    {
+        /* Horizontal line. */
+        outf("have found horizontal line: %s", extract_rect_string(&rect));
+        if (tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color)) return -1;
+    }
+    else if (dy / dx > 5)
+    {
+        /* Vertical line. */
+        outf("have found vertical line: %s", extract_rect_string(&rect));
+        if (tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color)) return -1;
+    }
+    return 0;   /* Rectangles that are not thin enough either way are silently ignored. */
+}
+
+
+int extract_add_line(
+        extract_t*  extract,
+        double ctm_a,   /* ctm_a .. ctm_f: matrix applied to both end points via transform(). */
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double width,   /* Stroke width before the matrix is applied. */
+        double x0,      /* (x0,y0) and (x1,y1): the two ends of the line. */
+        double y0,
+        double x1,
+        double y1,
+        double color    /* Stored with any table line that gets recorded. */
+        )
+{   /* Records an exactly horizontal or vertical stroked line as a table line on the last page. */
+    extract_page_t* page = extract->document.pages[extract->document.pages_num-1];
+    point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
+    point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
+    double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c));  /* Width scaled by sqrt(|determinant|). */
+    rect_t  rect;
+    (void) color;   /* NOTE(review): color is used below, so this cast looks redundant. */
+    rect.min.x = s_min(p0.x, p1.x);
+    rect.min.y = s_min(p0.y, p1.y);
+    rect.max.x = s_max(p0.x, p1.x);
+    rect.max.y = s_max(p0.y, p1.y);
+    
+    outf("%s: width=%f ((%f %f)(%f %f)) rect=%s",
+            extract_FUNCTION,
+            width,
+            x0, y0, x1, y1,
+            extract_rect_string(&rect)
+            );
+    if (rect.min.x == rect.max.x && rect.min.y == rect.max.y)
+    {   /* Zero-length line: nothing is recorded. */
+    }
+    else if (rect.min.x == rect.max.x)
+    {   /* Vertical line: widen the rectangle by half the scaled width on each side. */
+        rect.min.x -= width2 / 2;
+        rect.max.x += width2 / 2;
+        return tablelines_append(extract->alloc, &page->tablelines_vertical, &rect, color);
+    }
+    else if (rect.min.y == rect.max.y)
+    {   /* Horizontal line: thicken the rectangle by half the scaled width above and below. */
+        rect.min.y -= width2 / 2;
+        rect.max.y += width2 / 2;
+        return tablelines_append(extract->alloc, &page->tablelines_horizontal, &rect, color);
+    }
+    return 0;   /* Lines that are neither exactly vertical nor exactly horizontal are ignored. */
+}
+
+
 int extract_page_begin(extract_t* extract)
 {
     /* Appends new empty extract_page_t to an extract->document. */
@@ -1062,6 +1387,13 @@ int extract_page_begin(extract_t* extract)
     page->paragraphs_num = 0;
     page->images = NULL;
     page->images_num = 0;
+    page->tablelines_horizontal.tablelines = NULL;
+    page->tablelines_horizontal.tablelines_num = 0;
+    page->tablelines_vertical.tablelines = NULL;
+    page->tablelines_vertical.tablelines_num = 0;
+    page->tables = NULL;
+    page->tables_num = 0;
+    
     if (extract_realloc2(
             extract->alloc,
             &extract->document.pages,
@@ -1076,6 +1408,231 @@ int extract_page_begin(extract_t* extract)
     return 0;
 }
 
+int extract_fill_begin(
+        extract_t*  extract,
+        double ctm_a,   /* ctm_a .. ctm_f: matrix remembered for the points of this fill. */
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double color    /* Remembered and later passed on to extract_add_path4(). */
+        )
+{   /* Starts collecting a filled path; always returns 0. */
+    assert(extract->path_type == path_type_NONE);   /* No fill or stroke may already be in progress. */
+    extract->path_type = path_type_FILL;
+    extract->path.fill.color = color;
+    extract->path.fill.n = 0;   /* No points collected yet. */
+    extract->path.fill.ctm.a = ctm_a;
+    extract->path.fill.ctm.b = ctm_b;
+    extract->path.fill.ctm.c = ctm_c;
+    extract->path.fill.ctm.d = ctm_d;
+    extract->path.fill.ctm.e = ctm_e;
+    extract->path.fill.ctm.f = ctm_f;
+    return 0;
+}
+
+int extract_stroke_begin(
+        extract_t*  extract,
+        double ctm_a,   /* ctm_a .. ctm_f: matrix remembered for the points of this stroke. */
+        double ctm_b,
+        double ctm_c,
+        double ctm_d,
+        double ctm_e,
+        double ctm_f,
+        double line_width,  /* Remembered and later passed on to extract_add_line(). */
+        double color        /* Remembered and later passed on to extract_add_line(). */
+        )
+{   /* Starts collecting a stroked path; always returns 0. */
+    assert(extract->path_type == path_type_NONE);   /* No fill or stroke may already be in progress. */
+    extract->path_type = path_type_STROKE;
+    extract->path.stroke.ctm.a = ctm_a;
+    extract->path.stroke.ctm.b = ctm_b;
+    extract->path.stroke.ctm.c = ctm_c;
+    extract->path.stroke.ctm.d = ctm_d;
+    extract->path.stroke.ctm.e = ctm_e;
+    extract->path.stroke.ctm.f = ctm_f;
+    extract->path.stroke.width = line_width;
+    extract->path.stroke.color = color;
+    extract->path.stroke.point0_set = 0;    /* No subpath start point recorded yet. */
+    extract->path.stroke.point_set = 0;     /* No current point recorded yet. */
+    return 0;
+}
+
+int extract_moveto(extract_t* extract, double x, double y)
+{
+    /* Moves to (x, y) in the current fill or stroke. Returns 0, or -1 if neither is active. */
+    switch (extract->path_type)
+    {
+        case path_type_FILL:
+            /* n == -1 means this fill has already been given up on. */
+            if (extract->path.fill.n == -1) break;
+            if (extract->path.fill.n == 0)
+            {
+                extract->path.fill.points[0].x = x;
+                extract->path.fill.points[0].y = y;
+                extract->path.fill.n = 1;
+                break;
+            }
+            /* A moveto after points were collected: stop tracking this fill. */
+            outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
+            extract->path.fill.n = -1;
+            break;
+        case path_type_STROKE:
+            extract->path.stroke.point.x = x;
+            extract->path.stroke.point.y = y;
+            extract->path.stroke.point_set = 1;
+            if (extract->path.stroke.point0_set) break;
+            /* First point seen: also remember it as the start point. */
+            extract->path.stroke.point0 = extract->path.stroke.point;
+            extract->path.stroke.point0_set = 1;
+            break;
+        default:
+            assert(0);
+            return -1;
+    }
+    return 0;
+}
+
+int extract_lineto(extract_t* extract, double x, double y)
+{   /* Adds a segment ending at (x, y) to the current fill or stroke. Returns 0, or -1 on failure. */
+    if (extract->path_type == path_type_FILL)
+    {
+        if (extract->path.fill.n == -1)    return 0;   /* This fill has already been given up on. */
+        if (extract->path.fill.n == 0 || extract->path.fill.n >= 4)
+        {   /* No starting point yet, or points[] (four entries) is already full: stop tracking. */
+            outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
+            extract->path.fill.n = -1;
+            return 0;
+        }
+        extract->path.fill.points[extract->path.fill.n].x = x;
+        extract->path.fill.points[extract->path.fill.n].y = y;
+        extract->path.fill.n += 1;
+        return 0;
+    }
+    else if (extract->path_type == path_type_STROKE)
+    {
+        if (extract->path.stroke.point_set)
+        {   /* There is a current point, so hand the segment from it to (x, y) to extract_add_line(). */
+            if (extract_add_line(
+                    extract,
+                    extract->path.stroke.ctm.a,
+                    extract->path.stroke.ctm.b,
+                    extract->path.stroke.ctm.c,
+                    extract->path.stroke.ctm.d,
+                    extract->path.stroke.ctm.e,
+                    extract->path.stroke.ctm.f,
+                    extract->path.stroke.width,
+                    extract->path.stroke.point.x,
+                    extract->path.stroke.point.y,
+                    x,
+                    y,
+                    extract->path.stroke.color
+                    ))
+            {
+                return -1;
+            }
+        }
+        extract->path.stroke.point.x = x;   /* (x, y) becomes the current point. */
+        extract->path.stroke.point.y = y;
+        extract->path.stroke.point_set = 1;
+        if (!extract->path.stroke.point0_set)
+        {   /* First point seen: also remember it as the start point. */
+            extract->path.stroke.point0 = extract->path.stroke.point;
+            extract->path.stroke.point0_set = 1;
+        }
+        return 0;
+    }
+    else
+    {   /* Called outside any fill or stroke. */
+        assert(0);
+        return -1;
+    }
+}
+
+int extract_closepath(extract_t* extract)
+{   /* Closes the current subpath of the fill or stroke in progress. Returns 0, or non-zero on failure. */
+    if (extract->path_type == path_type_FILL)
+    {
+        if (extract->path.fill.n == 4)
+        {
+            /* We are closing a four-element path, so this could be a thin
+            rectangle that defines a line in a table. */
+            int e;
+            e = extract_add_path4(
+                    extract,
+                    extract->path.fill.ctm.a,
+                    extract->path.fill.ctm.b,
+                    extract->path.fill.ctm.c,
+                    extract->path.fill.ctm.d,
+                    extract->path.fill.ctm.e,
+                    extract->path.fill.ctm.f,
+                    extract->path.fill.points[0].x,
+                    extract->path.fill.points[0].y,
+                    extract->path.fill.points[1].x,
+                    extract->path.fill.points[1].y,
+                    extract->path.fill.points[2].x,
+                    extract->path.fill.points[2].y,
+                    extract->path.fill.points[3].x,
+                    extract->path.fill.points[3].y,
+                    extract->path.fill.color
+                    );
+            if (e) return e;
+        }
+        extract->path.fill.n = 0;   /* Start collecting afresh for the next subpath. */
+        return 0;
+    }
+    else if (extract->path_type == path_type_STROKE)
+    {
+        if (extract->path.stroke.point0_set && extract->path.stroke.point_set)
+        {
+            if (extract_add_line(   /* Closing segment: current point back to the start point. */
+                    extract,
+                    extract->path.stroke.ctm.a,
+                    extract->path.stroke.ctm.b,
+                    extract->path.stroke.ctm.c,
+                    extract->path.stroke.ctm.d,
+                    extract->path.stroke.ctm.e,
+                    extract->path.stroke.ctm.f,
+                    extract->path.stroke.width,
+                    extract->path.stroke.point.x,
+                    extract->path.stroke.point.y,
+                    extract->path.stroke.point0.x,
+                    extract->path.stroke.point0.y,
+                    extract->path.stroke.color
+                    ))
+            {
+                return -1;
+            }
+            /* Do not return here: the current point must still move back to the start. */
+        }
+        extract->path.stroke.point = extract->path.stroke.point0;
+        return 0;
+    }
+    else
+    {   /* Called outside any fill or stroke. */
+        assert(0);
+        return -1;
+    }
+}
+
+
+int extract_fill_end(extract_t* extract)
+{   /* Ends the fill started by extract_fill_begin(); always returns 0. */
+    assert(extract->path_type == path_type_FILL);   /* Must be paired with extract_fill_begin(). */
+    extract->path_type = path_type_NONE;
+    return 0;
+}
+
+
+int extract_stroke_end(extract_t* extract)
+{   /* Ends the stroke started by extract_stroke_begin(); always returns 0. */
+    assert(extract->path_type == path_type_STROKE); /* Must be paired with extract_stroke_begin(). */
+    extract->path_type = path_type_NONE;
+    return 0;
+}
+
+
 
 int extract_page_end(extract_t* extract)
 {
@@ -1083,6 +1640,118 @@ int extract_page_end(extract_t* extract)
     return 0;
 }
 
+
+static int paragraphs_to_text_content(
+        extract_alloc_t* alloc,
+        paragraph_t** paragraphs,
+        int paragraphs_num,
+        extract_astring_t* text
+        )
+{
+    /* Appends the characters of every span of every line of each paragraph to
+    <text>, followed by one newline per paragraph. Returns 0 on success, or -1
+    as soon as an append fails. */
+    int pi;
+    int li;
+    int si;
+    int ci;
+    for (pi = 0; pi < paragraphs_num; ++pi)
+    {
+        paragraph_t* para = paragraphs[pi];
+        for (li = 0; li < para->lines_num; ++li)
+        {
+            line_t* ln = para->lines[li];
+            for (si = 0; si < ln->spans_num; ++si)
+            {
+                span_t* sp = ln->spans[si];
+                for (ci = 0; ci < sp->chars_num; ++ci)
+                {
+                    /* Plain text: no XML escaping, all three ascii_* options on. */
+                    unsigned ucs = sp->chars[ci].ucs;
+                    int xml = 0;
+                    int ascii = 1;
+                    if (extract_astring_catc_unicode(alloc, text, ucs, xml, ascii, ascii, ascii))
+                    {
+                        return -1;
+                    }
+                }
+            }
+        }
+        /* One newline after each paragraph. */
+        if (extract_astring_catc(alloc, text, '\n')) return -1;
+    }
+    return 0;
+}
+
+
+static int extract_write_tables_csv(extract_t* extract)
+{
+    /* Writes every table of every page to its own CSV file, whose path is made from
+    extract->tables_csv_format and a running number. Returns 0 if ok or no format, else -1. */
+    int ret = -1;
+    int p;
+    char* path = NULL;
+    FILE* f = NULL;
+    extract_astring_t text = {NULL, 0};
+    if (!extract->tables_csv_format) return 0;
+    
+    outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format);
+    outf("extract->document.pages_num=%i", extract->document.pages_num);
+    for (p=0; p<extract->document.pages_num; ++p)
+    {
+        extract_page_t* page = extract->document.pages[p];
+        int t;
+        outf("p=%i page->tables_num=%i", p, page->tables_num);
+        for (t=0; t<page->tables_num; ++t)
+        {
+            table_t* table = page->tables[t];
+            int y;
+            extract_free(extract->alloc, &path);
+            if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end;
+            extract->tables_csv_i += 1;
+            outf("Writing table %i to: %s", t, path);
+            outf("table->cells_num_x=%i", table->cells_num_x);
+            outf("table->cells_num_y=%i", table->cells_num_y);
+            f = fopen(path, "w");
+            if (!f) goto end;
+            for (y=0; y<table->cells_num_y; ++y)
+            {
+                int x;
+                for (x=0; x<table->cells_num_x; ++x)
+                {
+                    cell_t* cell = table->cells[table->cells_num_x * y + x];
+                    const char* s;
+                    extract_astring_free(extract->alloc, &text);
+                    if (y==0) { outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); }
+                    if (x) fprintf(f, ",");     /* Separator before every cell except the first. */
+                    if (paragraphs_to_text_content(extract->alloc, cell->paragraphs, cell->paragraphs_num, &text)) goto end;
+                    /* Reference csv output trims trailing spaces. */
+                    extract_astring_char_truncate_if(&text, ' ');
+                    /* Quote the cell, doubling any embedded quote so it cannot end the field early. */
+                    fputc('"', f);
+                    for (s = text.chars ? text.chars : ""; *s; ++s)
+                    {
+                        if (*s == '"') fputc('"', f);
+                        fputc(*s, f);
+                    }
+                    fputc('"', f);
+                }
+                fprintf(f, "\n");
+            }
+            fclose(f);
+            f = NULL;
+        }
+    }
+    ret = 0;
+
+    end:
+    if (f) fclose(f);
+    extract_free(extract->alloc, &path);
+    extract_astring_free(extract->alloc, &text);
+    return ret;
+}
+
+
 int extract_process(
         extract_t*  extract,
         int         spacing,
@@ -1126,6 +1795,30 @@ int extract_process(
                 &extract->contentss[extract->contentss_num - 1]
                 )) goto end;
     }
+    else if (extract->format == extract_format_HTML)
+    {
+        if (extract_document_to_html_content(
+                extract->alloc,
+                &extract->document,
+                rotation,
+                images,
+                &extract->contentss[extract->contentss_num - 1]
+                )) goto end;
+    }
+    else if (extract->format == extract_format_TEXT)
+    {
+        int p;
+        for (p=0; p<extract->document.pages_num; ++p)
+        {
+            extract_page_t* page = extract->document.pages[p];
+            if (paragraphs_to_text_content(
+                    extract->alloc,
+                    page->paragraphs,
+                    page->paragraphs_num,
+                    &extract->contentss[extract->contentss_num - 1]
+                    )) goto end;
+        }
+    }
     else
     {
         outf0("Invalid format=%i", extract->format);
@@ -1136,11 +1829,15 @@ int extract_process(
 
     if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end;
     
+    if (extract->tables_csv_format)
+    {
+        extract_write_tables_csv(extract);
+    }
+    
     {
         int i;
         for (i=0; i<extract->document.pages_num; ++i) {
-            page_free(extract->alloc, extract->document.pages[i]);
-            extract_free(extract->alloc, &extract->document.pages[i]);
+            page_free(extract->alloc, &extract->document.pages[i]);
         }
         extract_free(extract->alloc, &extract->document.pages);
         extract->document.pages_num = 0;
@@ -1159,9 +1856,9 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
     char*           text2 = NULL;
     int             i;
     
-    if (extract_zip_open(buffer, &zip)) goto end;
     if (extract->format == extract_format_ODT)
     {
+        if (extract_zip_open(buffer, &zip)) goto end;
         for (i=0; i<odt_template_items_num; ++i) {
             const odt_template_item_t* item = &odt_template_items[i];
             extract_free(extract->alloc, &text2);
@@ -1191,9 +1888,11 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
             if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end;
             if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
         }
+        if (extract_zip_close(&zip)) goto end;
     }
     else if (extract->format == extract_format_DOCX)
     {
+        if (extract_zip_open(buffer, &zip)) goto end;
         for (i=0; i<docx_template_items_num; ++i) {
             const docx_template_item_t* item = &docx_template_items[i];
             extract_free(extract->alloc, &text2);
@@ -1222,6 +1921,22 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
             if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end;
             if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
         }
+        if (extract_zip_close(&zip)) goto end;
+        
+    }
+    else if (extract->format == extract_format_HTML)
+    {
+        for (i=0; i<extract->contentss_num; ++i)
+        {
+            if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
+        }
+    }
+    else if (extract->format == extract_format_TEXT)
+    {
+        for (i=0; i<extract->contentss_num; ++i)
+        {
+            if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
+        }
     }
     else
     {
@@ -1231,15 +1946,15 @@ int extract_write(extract_t* extract, extract_buffer_t* buffer)
         return 1;
     }
     
-    if (extract_zip_close(&zip)) goto end;
-    assert(!zip);
-    
     e = 0;
     
     end:
-    if (e) outf("failed: %s", strerror(errno));
+    if (e)
+    {
+        outf("failed: %s", strerror(errno));
+        extract_zip_close(&zip);
+    }
     extract_free(extract->alloc, &text2);
-    extract_zip_close(&zip);
     
     return e;
 }
@@ -1300,6 +2015,7 @@ int extract_write_template(
     }
 }
 
+
 void extract_end(extract_t** pextract)
 {
     extract_t* extract = *pextract;
@@ -1314,12 +2030,13 @@ void extract_end(extract_t** pextract)
         extract_free(extract->alloc, &extract->contentss);
     }
     extract_images_free(extract->alloc, &extract->images);
+    extract_odt_styles_free(extract->alloc, &extract->odt_styles);
     extract_free(extract->alloc, pextract);
 }
 
 void extract_internal_end(void)
 {
-    span_string(NULL, NULL);
+    extract_span_string(NULL, NULL);
 }
 
 void extract_exp_min(extract_t* extract, size_t size)
@@ -1329,8 +2046,8 @@ void extract_exp_min(extract_t* extract, size_t size)
 
 double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm)
 {
-    double font_size = matrix_expansion(*trm)
-            * matrix_expansion(*ctm);
+    double font_size = extract_matrix_expansion(*trm)
+            * extract_matrix_expansion(*ctm);
     /* Round font_size to nearest 0.01. */
     font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f;
     return font_size;
diff --git a/extract/src/html.c b/extract/src/html.c
new file mode 100644
index 00000000..d12a3101
--- /dev/null
+++ b/extract/src/html.c
@@ -0,0 +1,314 @@
+/* These functions generate HTML content from an extracted document; they do
+not create an archive.
+
+Caller must call things in a sensible order to create valid content -
+e.g. call content_state_reset() to close any open <b> or <i> before
+closing the element that contains them. */
+
+#include "../include/extract.h"
+
+#include "astring.h"
+#include "document.h"
+#include "html.h"
+#include "mem.h"
+#include "memento.h"
+#include "outf.h"
+#include "sys.h"
+#include "text.h"
+#include "zip.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sys/stat.h>
+
+
+static void content_state_init(content_state_t* content_state)
+/* Puts *content_state into its initial state: no font, no open <b>/<i>. */
+{
+    content_state->ctm_prev = NULL;
+    content_state->font.name = NULL;
+    content_state->font.size = 0;
+    content_state->font.italic = content_state->font.bold = 0;
+}
+
+static int content_state_reset(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content)
+/* Appends closing tags for any <b> and/or <i> that *content_state records as
+open (always </b> first, then </i>) and marks them as closed.
+
+Returns 0 on success, or -1 if appending to <content> failed. */
+{
+    if (content_state->font.bold)
+    {
+        if (extract_astring_cat(alloc, content, "</b>")) return -1;
+        content_state->font.bold = 0;
+    }
+    if (content_state->font.italic)
+    {
+        if (extract_astring_cat(alloc, content, "</i>")) return -1;
+        content_state->font.italic = 0;
+    }
+    return 0;
+}
+
+static int paragraph_to_html_content(
+        extract_alloc_t*    alloc,
+        content_state_t*    content_state,
+        paragraph_t*        paragraph,
+        int                 single_line,
+        extract_astring_t*  content
+        )
+/* Appends <p>...</p> for <paragraph> to <content>, writing <b>/<i> tags when
+a span's bold/italic differs from *content_state (which is updated). Newlines
+are added around the tags unless <single_line>. Returns 0, or -1 on failure. */
+{
+    int e = -1;
+    const char* endl = (single_line) ? "" : "\n";
+    int l;
+    if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end;
+
+    for (l=0; l<paragraph->lines_num; ++l)
+    {
+        line_t* line = paragraph->lines[l];
+        int s;
+        for (s=0; s<line->spans_num; ++s)
+        {
+            int c;
+            span_t* span = line->spans[s];
+            content_state->ctm_prev = &span->ctm;
+            if (span->flags.font_bold != content_state->font.bold)
+            {
+                if (extract_astring_cat(alloc, content, span->flags.font_bold ? "<b>" : "</b>")) goto end;
+                content_state->font.bold = span->flags.font_bold;
+            }
+            if (span->flags.font_italic != content_state->font.italic)
+            {
+                if (extract_astring_cat(alloc, content, span->flags.font_italic ? "<i>" : "</i>")) goto end;
+                content_state->font.italic = span->flags.font_italic;
+            }
+
+            for (c=0; c<span->chars_num; ++c)
+            {
+                char_t* char_ = &span->chars[c];
+                if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end;
+            }
+        }
+
+        /* Join to next line: drop a trailing '-', else ensure a separating space. */
+        if (content->chars_num && l+1 < paragraph->lines_num)
+        {
+            if (content->chars[content->chars_num-1] == '-')    content->chars_num -= 1;
+            else if (content->chars[content->chars_num-1] != ' ')
+            {
+                if (extract_astring_catc(alloc, content, ' ')) goto end;
+            }
+        }
+    }
+    if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end;
+    
+    e = 0;
+    
+    end:
+    return e;
+}
+
+
+static int paragraphs_to_html_content(
+        extract_alloc_t*    alloc,
+        content_state_t*    state,
+        paragraph_t**       paragraphs,
+        int                 paragraphs_num,
+        int                 single_line,
+        extract_astring_t*  content
+        )
+/* Appends html for each of paragraphs[0..paragraphs_num-1] to <content> in
+order, then closes any <b>/<i> still open according to *state.
+
+*state is updated as bold/italic change. Returns 0 on success; returns -1 as
+soon as any append fails. */
+{
+    int i = 0;
+    while (i < paragraphs_num)
+    {
+        if (paragraph_to_html_content(alloc, state, paragraphs[i], single_line, content))
+        {
+            return -1;
+        }
+        i += 1;
+    }
+    return content_state_reset(alloc, state, content) ? -1 : 0;
+}
+
+static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* table, extract_astring_t* content)
+/* Appends an html <table> for <table> to <content>, with one <tr> per row and
+one <td> for each cell that has both .above and .left set; a cell's paragraphs
+are written inside its <td>.
+
+Returns 0 on success, or -1 as soon as appending to <content> fails. */
+{
+    /* If 1, we put each <td>...</td> on a separate line. */
+    const int multiline = 0;
+    int row;
+    
+    if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) return -1;
+    
+    for (row=0; row<table->cells_num_y; ++row)
+    {
+        int col;
+
+        if (extract_astring_cat(alloc, content, "    <tr>\n")) return -1;
+        if (!multiline && extract_astring_cat(alloc, content, "        ")) return -1;
+
+        for (col=0; col<table->cells_num_x; ++col)
+        {
+            cell_t* cell = table->cells[row*table->cells_num_x + col];
+
+            /* HTML does not require anything for cells that are subsumed by
+            other cells that extend horizontally and vertically. */
+            if (!(cell->above && cell->left))
+            {
+                continue;
+            }
+
+            /* Opening tag, with colspan/rowspan only if the cell extends. */
+            if (extract_astring_cat(alloc, content, "        ")) return -1;
+            if (extract_astring_cat(alloc, content, "<td")) return -1;
+            if (cell->extend_right > 1
+                    && extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)
+                    ) return -1;
+            if (cell->extend_down > 1
+                    && extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)
+                    ) return -1;
+            if (extract_astring_cat(alloc, content, ">")) return -1;
+
+            /* Cell content (single-line paragraphs), then the closing tag. */
+            if (paragraphs_to_html_content(alloc, state, cell->paragraphs, cell->paragraphs_num, 1 /* single_line*/, content)) return -1;
+            if (extract_astring_cat(alloc, content, "</td>")) return -1;
+            if (extract_astring_cat(alloc, content, "\n")) return -1;
+
+            /* Close any <b>/<i> that *state still records as open. */
+            if (content_state_reset(alloc, state, content)) return -1;
+        }
+
+        if (!multiline && extract_astring_cat(alloc, content, "\n")) return -1;
+        if (extract_astring_cat(alloc, content, "    </tr>\n")) return -1;
+    }
+
+    if (extract_astring_cat(alloc, content, "</table>\n\n")) return -1;
+    return 0;
+}
+
+
+static char_t* paragraph_first_char(const paragraph_t* paragraph)
+/* Returns the first char_t of the first span of the first line. */
+{
+    span_t* span = paragraph->lines[0]->spans[0];
+    return &span->chars[0];
+}
+
+static int compare_paragraph_y(const void* a, const void* b)
+/* qsort() callback for arrays of paragraph_t*; orders by increasing y of the
+char returned by paragraph_first_char(). Returns -1, 0 or +1. */
+{
+    const paragraph_t* lhs = *(const paragraph_t* const*) a;
+    const paragraph_t* rhs = *(const paragraph_t* const*) b;
+    double lhs_y = paragraph_first_char(lhs)->y;
+    double rhs_y = paragraph_first_char(rhs)->y;
+    return (lhs_y > rhs_y) - (lhs_y < rhs_y);
+}
+
+int extract_document_to_html_content(
+        extract_alloc_t*    alloc,
+        document_t*         document,
+        int                 rotation,
+        int                 images,
+        extract_astring_t*  content
+        )
+/* Appends html for all pages' paragraphs and tables (merged by y coordinate) to
+<content>; ignores <rotation> and <images>. Returns 0, or -1 on failure. */
+{
+    int ret = -1;
+    int page_i;
+    paragraph_t** paragraphs = NULL;
+    (void) rotation;
+    (void) images;
+    
+    if (extract_astring_cat(alloc, content, "<html>\n")) goto end;
+    if (extract_astring_cat(alloc, content, "<body>\n")) goto end;
+    
+    /* Write each page's paragraphs and tables into <content>. */
+    for (page_i=0; page_i<document->pages_num; ++page_i)
+    {
+        extract_page_t* page = document->pages[page_i];
+        int p;
+        int t;
+        content_state_t state;
+        content_state_init(&state);
+        extract_free(alloc, &paragraphs);
+        
+        /* Output paragraphs and tables in order of increasing <y> coordinate.
+
+        Unfortunately the paragraph ordering we do in page->paragraphs[]
+        isn't quite right and results in bad ordering if ctm/trm matrices are
+        inconsistent. So we create our own list of paragraphs sorted strictly
+        by y coordinate of the first char of each paragraph. */
+        if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * page->paragraphs_num)) goto end;
+        for (p = 0; p < page->paragraphs_num; ++p)
+        {
+            paragraphs[p] = page->paragraphs[p];
+        }
+        qsort(paragraphs, page->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);
+        
+        if (0)
+        {
+            int i;
+            outf0("paragraphs are:");
+            for (i=0; i<page->paragraphs_num; ++i)
+            {
+                paragraph_t* paragraph = page->paragraphs[i];
+                line_t* line = paragraph->lines[0];
+                span_t* span = line->spans[0];
+                outf0("    p=%i: %s", i, extract_span_string(NULL, span));
+            }
+        }
+
+        p = 0;
+        t = 0;
+        for(;;)
+        {
+            double y_paragraph;
+            double y_table;
+            paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : paragraphs[p];
+            table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+            if (!paragraph && !table) break;
+            y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+            y_table = (table) ? table->pos.y : DBL_MAX;
+            outf("p=%i y_paragraph=%f", p, y_paragraph);
+            outf("t=%i y_table=%f", t, y_table);
+            if (paragraph && (!table || y_paragraph < y_table))
+            {
+                /* !table: always make progress, even if y is NaN or infinite. */
+                if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, content)) goto end;
+                if (content_state_reset(alloc, &state, content)) goto end;
+                p += 1;
+            }
+            else if (table)
+            {
+                if (append_table(alloc, &state, table, content)) goto end;
+                t += 1;
+            }
+        }
+    }
+    if (extract_astring_cat(alloc, content, "</body>\n")) goto end;
+    if (extract_astring_cat(alloc, content, "</html>\n")) goto end;
+    ret = 0;
+
+    end:
+    extract_free(alloc, &paragraphs);
+    return ret;
+}
diff --git a/extract/src/html.h b/extract/src/html.h
new file mode 100644
index 00000000..6148a067
--- /dev/null
+++ b/extract/src/html.h
@@ -0,0 +1,23 @@
+#ifndef ARTIFEX_EXTRACT_HTML_H
+#define ARTIFEX_EXTRACT_HTML_H
+
+/* Only for internal use by extract code.  */
+
+/* Things for creating HTML content. */
+
+int extract_document_to_html_content(
+        extract_alloc_t*    alloc,
+        document_t*         document,
+        int                 rotation,
+        int                 images,
+        extract_astring_t*  content
+        );
+/* Appends html for all paragraphs and tables in *document to <content>,
+wrapped in <html><body>...</body></html>.
+
+<rotation> and <images> are currently ignored.
+
+Returns 0 on success, or -1 if generating the content failed. */
+
+
+#endif
diff --git a/extract/src/join.c b/extract/src/join.c
index f12e2751..4425de3d 100644
--- a/extract/src/join.c
+++ b/extract/src/join.c
@@ -7,6 +7,7 @@
 #include "outf.h"
 
 #include <assert.h>
+#include <float.h>
 #include <math.h>
 #include <stdio.h>
 
@@ -17,24 +18,39 @@ static char_t* span_char_first(span_t* span)
     return &span->chars[0];
 }
 
+static span_t* s_line_span_first(line_t* line)
+{   /* File-local wrapper that simply forwards to extract_line_span_first(). */
+    return extract_line_span_first(line);
+}
+
 /* Returns first char_t in a line. */
 static char_t* line_item_first(line_t* line)
 {
-    span_t* span = line_span_first(line);
+    span_t* span = s_line_span_first(line);
     return span_char_first(span);
 }
 
 /* Returns last char_t in a line. */
 static char_t* line_item_last(line_t* line)
 {
-    span_t* span = line_span_last(line);
-    return span_char_last(span);
+    span_t* span = extract_line_span_last(line);
+    return extract_span_char_last(span);
 }
 
-static const char* matrix_string(const matrix_t* matrix)
+static point_t char_to_point(const char_t* char_)
 {
-    static char ret[64];
-    snprintf(ret, sizeof(ret), "{%f %f %f %f %f %f}",
+    point_t ret;
+    ret.x = char_->x;
+    ret.y = char_->y;
+    return ret;
+}
+
+const char* extract_matrix_string(const matrix_t* matrix)
+{
+    static char ret[5][64];
+    static int i = 0;
+    i = (i + 1) % 5;
+    snprintf(ret[i], sizeof(ret[i]), "{%f %f %f %f %f %f}",
             matrix->a,
             matrix->b,
             matrix->c,
@@ -42,17 +58,17 @@ static const char* matrix_string(const matrix_t* matrix)
             matrix->e,
             matrix->f
             );
-    return ret;
+    return ret[i];
 }
 
 /* Returns total width of span. */
 static double span_adv_total(span_t* span)
 {
-    double dx = span_char_last(span)->x - span_char_first(span)->x;
-    double dy = span_char_last(span)->y - span_char_first(span)->y;
+    double dx = extract_span_char_last(span)->x - span_char_first(span)->x;
+    double dy = extract_span_char_last(span)->y - span_char_first(span)->y;
     /* We add on the advance of the last item; this avoids us returning zero if
     there's only one item. */
-    double adv = span_char_last(span)->adv * matrix_expansion(span->trm);
+    double adv = extract_span_char_last(span)->adv * extract_matrix_expansion(span->trm);
     return sqrt(dx*dx + dy*dy) + adv;
 }
 
@@ -66,15 +82,30 @@ static double spans_adv(
     double delta_x = b->x - a->x;
     double delta_y = b->y - a->y;
     double s = sqrt( delta_x*delta_x + delta_y*delta_y);
-    double a_size = a->adv * matrix_expansion(a_span->trm);
+    double a_size = a->adv * extract_matrix_expansion(a_span->trm);
     s -= a_size;
     return s;
 }
 
 static double span_angle(span_t* span)
 {
-    /* Assume ctm is a rotation matix. */
     double ret = atan2(-span->ctm.c, span->ctm.a);
+    if (0)
+    {
+        /* This is an attempt to take into account the trm matrix when looking
+        at spans, because for agstat.pdf vertical text seems to be achieved
+        by making trm rotate by 90 degrees. But it messes up the ordering of
+        rotated paragraphs in Python2.pdf so is disabled for now. */
+        matrix_t m = extract_multiply_matrix_matrix(span->trm, span->ctm);
+        point_t dir;
+        double ret;
+        dir.x = span->flags.wmode ? 0 : 1;
+        dir.y = span->flags.wmode ? 1 : 0;
+        dir = extract_multiply_matrix_point(m, dir);
+        ret = atan2(dir.y, dir.x);
+        return ret;
+    }
+    /* Assume ctm is a rotation matrix. */
     outfx("ctm.a=%f ctm.b=%f ret=%f", span->ctm.a, span->ctm.b, ret);
     return ret;
     /* Not sure whether this is right. Inclined text seems to be done by
@@ -89,6 +120,22 @@ static double span_angle(span_t* span)
     }*/
 }
 
+static double span_angle2(span_t* span)
+/* As span_angle(), but also logs if the chars' positions imply another angle. */
+{
+    if (span->chars_num > 1)
+    {
+        const char_t* tail = &span->chars[span->chars_num-1];
+        double ret1 = span_angle(span);
+        double ret2 = atan2(-(tail->y - span->chars[0].y), tail->x - span->chars[0].x);
+        if (fabs(ret2 - ret1) > 0.01)
+        {
+            outf("### ret1=%f ret2=%f: %s", ret1, ret2, extract_span_string(NULL, span));
+        }
+    }
+    return span_angle(span);
+}
+
 /* Returns static string containing brief info about span_t. */
 static const char* span_string2(extract_alloc_t* alloc, span_t* span)
 {
@@ -182,36 +229,36 @@ static int lines_are_compatible(
 {
     if (a == b) return 0;
     if (!a->spans || !b->spans) return 0;
-    if (line_span_first(a)->flags.wmode != line_span_first(b)->flags.wmode) {
+    if (s_line_span_first(a)->flags.wmode != s_line_span_first(b)->flags.wmode) {
         return 0;
     }
-    if (matrix_cmp4(
-            &line_span_first(a)->ctm,
-            &line_span_first(b)->ctm
+    if (extract_matrix_cmp4(
+            &s_line_span_first(a)->ctm,
+            &s_line_span_first(b)->ctm
             )) {
         if (verbose) {
             outf("ctm's differ:");
             outf("    %f %f %f %f %f %f",
-                    line_span_first(a)->ctm.a,
-                    line_span_first(a)->ctm.b,
-                    line_span_first(a)->ctm.c,
-                    line_span_first(a)->ctm.d,
-                    line_span_first(a)->ctm.e,
-                    line_span_first(a)->ctm.f
+                    s_line_span_first(a)->ctm.a,
+                    s_line_span_first(a)->ctm.b,
+                    s_line_span_first(a)->ctm.c,
+                    s_line_span_first(a)->ctm.d,
+                    s_line_span_first(a)->ctm.e,
+                    s_line_span_first(a)->ctm.f
                     );
             outf("    %f %f %f %f %f %f",
-                    line_span_first(b)->ctm.a,
-                    line_span_first(b)->ctm.b,
-                    line_span_first(b)->ctm.c,
-                    line_span_first(b)->ctm.d,
-                    line_span_first(b)->ctm.e,
-                    line_span_first(b)->ctm.f
+                    s_line_span_first(b)->ctm.a,
+                    s_line_span_first(b)->ctm.b,
+                    s_line_span_first(b)->ctm.c,
+                    s_line_span_first(b)->ctm.d,
+                    s_line_span_first(b)->ctm.e,
+                    s_line_span_first(b)->ctm.f
                     );
         }
         return 0;
     }
     {
-        double angle_b = span_angle(line_span_first(b));
+        double angle_b = span_angle(s_line_span_first(b));
         if (angle_b != angle_a) {
             outfx("%s:%i: angles differ");
             return 0;
@@ -221,6 +268,80 @@ static int lines_are_compatible(
 }
 
 
+static const unsigned ucs_NONE = ((unsigned) -1);
+
+static int s_span_inside_rects(
+        extract_alloc_t* alloc,
+        span_t* span,
+        rect_t* rects,
+        int rects_num,
+        span_t* o_span
+        )
+/* Returns with <o_span> containing char_t's from <span> that are inside
+rects[], and *span modified to remove any char_t's that we have moved to
+<o_span>. Returns 0 on success, or -1 if we fail to allocate.
+
+May return with span->chars_num == 0, in which case the caller must remove the
+span (including freeing .font_name), because lots of code assumes that there
+are no empty spans. */
+{
+    int c;
+    *o_span = *span;
+    o_span->font_name = NULL;   /* Don't alias span's data if we fail below. */
+    o_span->chars = NULL;
+    o_span->chars_num = 0;
+    if (extract_strdup(alloc, span->font_name, &o_span->font_name)) return -1;
+    for (c=0; c<span->chars_num; ++c)
+    {
+        /* For now we just look at whether the char's (x, y) is within any
+        rects[]. We could instead try to find character's bounding box etc. */
+        char_t* char_ = &span->chars[c];
+        int r;
+        for (r=0; r<rects_num; ++r)
+        {
+            rect_t* rect = &rects[r];
+            if (1
+                    && char_->x >= rect->min.x
+                    && char_->x < rect->max.x
+                    && char_->y >= rect->min.y
+                    && char_->y < rect->max.y
+                    )
+            {
+                if (extract_span_append_c(alloc, o_span, char_->ucs))   return -1;
+                /* Coverity warns, but o_span must have at least one item. */
+                /* coverity[var_deref_op] */
+                *extract_span_char_last(o_span) = *char_;
+                char_->ucs = ucs_NONE; /* Mark for removal below, so it is not used again. */
+                break;
+            }
+        }
+    }
+
+    /* Remove any char_t's that we've used. */
+    {
+        int cc = 0;
+        for (c=0; c<span->chars_num; ++c)
+        {
+            char_t* char_ = &span->chars[c];
+            if (char_->ucs != ucs_NONE)
+            {
+                span->chars[cc] = span->chars[c];
+                cc += 1;
+            }
+        }
+        /* This might set span->chars_num to zero; our caller needs to remove
+        the span - lots of code assumes that all spans contain at least one
+        character. */
+        span->chars_num = cc;
+    }
+
+    if (o_span->chars_num)
+    {
+        outf("o_span: %s", extract_span_string(alloc, o_span));
+    }
+    return 0;
+}
+
 /* Creates representation of span_t's that consists of a list of line_t's, with
 each line_t contains pointers to a list of span_t's.
 
@@ -230,11 +351,16 @@ On entry:
     Original value of *o_lines and *o_lines_num are ignored.
 
     <spans> points to array of <spans_num> span_t*'s, each pointing to
-    an span_t.
+    a span_t.
 
 On exit:
     If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num
-    line_t*'s, each pointing to an line_t.
+    line_t*'s, each pointing to a line_t.
+    
+    If <rects_num> is zero, each of these line_t's will contain pointers to
+    items in <spans>; otherwise each of the line_t's will contain new spans
+    which should be freed by the caller (spans are not necessarily wholly inside
+    or outside rects[] so we need to create new spans).
 
     Otherwise we return -1 with errno set. *o_lines and *o_lines_num are
     undefined.
@@ -242,35 +368,85 @@ On exit:
 static int make_lines(
         extract_alloc_t*    alloc,
         span_t**            spans,
-        int                 spans_num,
+        int*                spans_num,
+        rect_t*             rects,
+        int                 rects_num,
         line_t***           o_lines,
         int*                o_lines_num
         )
 {
     int ret = -1;
 
-    /* Make an line_t for each span. Then we will join some of these
-    line_t's together before returning. */
-    int         lines_num = spans_num;
+    /* Make a line_t for each span. Then we will join some of these line_t's
+    together before returning. */
+    int         lines_num = 0;
     line_t**    lines = NULL;
     int         a;
     int         num_compatible;
     int         num_joins;
-    if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end;
-
-    /* Ensure we can clean up after error. */
-    for (a=0; a<lines_num; ++a) {
-        lines[a] = NULL;
-    }
-    for (a=0; a<lines_num; ++a) {
-        if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end;
-        lines[a]->spans_num = 0;
-        if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end;
-        lines[a]->spans_num = 1;
-        lines[a]->spans[0] = spans[a];
-        outfx("initial line a=%i: %s", a, line_string(lines[a]));
+    span_t*     span = NULL;
+    
+    if (rects_num)
+    {
+        /* Make <lines> contain new span_t's and char_t's that are inside rects[]. */
+        for (a=0; a<*spans_num; ++a)
+        {
+            if (spans[a]->chars_num == 0)   continue; /* In case used for table, */
+            if (extract_realloc(alloc, &span, sizeof(*span))) goto end;
+            extract_span_init(span);
+            if (s_span_inside_rects(alloc, spans[a], rects, rects_num, span))
+            {
+                goto end;
+            }
+            if (span->chars_num)
+            {
+                if (extract_realloc(alloc, &lines, sizeof(*lines) * (lines_num + 1))) goto end;
+                if (extract_malloc(alloc, &lines[lines_num], sizeof(line_t))) goto end;
+                lines_num += 1;
+                if (extract_malloc(alloc, &lines[lines_num-1]->spans, sizeof(span_t*) * 1)) goto end;
+                lines[lines_num-1]->spans[0] = span;
+                lines[lines_num-1]->spans_num = 1;
+                span = NULL;
+            }
+            else
+            {
+                extract_span_free(alloc, &span);
+            }
+            
+            if (!spans[a]->chars_num)
+            {
+                /* All characters in this span are inside table, so remove
+                entire span, otherwise the same characters will end up being
+                output outside the table also. */
+                extract_span_free(alloc, &spans[a]);
+                memmove(&spans[a], &spans[a+1], sizeof(*spans) * ((*spans_num) - (a+1)));
+                *spans_num -= 1;
+                a -= 1;
+            }
+        }
     }
+    else
+    {
+        /* Make <lines> be a copy of <spans>. */
+        lines_num = *spans_num;
+        if (extract_malloc(alloc, &lines, sizeof(*lines) * lines_num)) goto end;
 
+        /* Ensure we can clean up after error. */
+        for (a=0; a<lines_num; ++a) {
+            lines[a] = NULL;
+        }
+        for (a=0; a<lines_num; ++a) {
+            if (extract_malloc(alloc, &lines[a], sizeof(line_t))) goto end;
+            lines[a]->spans_num = 0;
+            if (extract_malloc(alloc, &lines[a]->spans, sizeof(span_t*) * 1)) goto end;
+            lines[a]->spans_num = 1;
+            lines[a]->spans[0] = spans[a];
+            /* Ensure that spans[] can be safely freed now we've moved it into lines[]. */
+            spans[a] = NULL;
+            outfx("initial line a=%i: %s", a, line_string(lines[a]));
+        }
+    }
+    
     num_compatible = 0;
 
     /* For each line, look for nearest aligned line, and append if found. */
@@ -290,14 +466,14 @@ static int make_lines(
         }
 
         if (0 && a < 1) verbose = 1;
-        outfx("looking at line_a=%s", line_string2(line_a));
+        outfx("looking at line_a=%s", line_string2(alloc, line_a));
 
-        span_a = line_span_last(line_a);
+        span_a = extract_line_span_last(line_a);
         angle_a = span_angle(span_a);
         if (verbose) outf("a=%i angle_a=%f ctm=%s: %s",
                 a,
                 angle_a * 180/pi,
-                matrix_string(&span_a->ctm),
+                extract_matrix_string(&span_a->ctm),
                 line_string2(alloc, line_a)
                 );
 
@@ -310,7 +486,6 @@ static int make_lines(
                 continue;
             }
             if (verbose) {
-                outf("");
                 outf("a=%i b=%i: nearest_line_b=%i nearest_adv=%f",
                         a,
                         b,
@@ -330,17 +505,17 @@ static int make_lines(
                 /* Find angle between last glyph of span_a and first glyph of
                 span_b. This detects whether the lines are lined up with each other
                 (as opposed to being at the same angle but in different lines). */
-                span_t* span_b = line_span_first(line_b);
-                double dx = span_char_first(span_b)->x - span_char_last(span_a)->x;
-                double dy = span_char_first(span_b)->y - span_char_last(span_a)->y;
+                span_t* span_b = s_line_span_first(line_b);
+                double dx = span_char_first(span_b)->x - extract_span_char_last(span_a)->x;
+                double dy = span_char_first(span_b)->y - extract_span_char_last(span_a)->y;
                 double angle_a_b = atan2(-dy, dx);
                 const double angle_tolerance_deg = 1;
                 if (verbose) {
                     outf("delta=(%f %f) alast=(%f %f) bfirst=(%f %f): angle_a=%f angle_a_b=%f",
                             dx,
                             dy,
-                            span_char_last(span_a)->x,
-                            span_char_last(span_a)->y,
+                            extract_span_char_last(span_a)->x,
+                            extract_span_char_last(span_a)->y,
                             span_char_first(span_b)->x,
                             span_char_first(span_b)->y,
                             angle_a * 180 / pi,
@@ -353,7 +528,7 @@ static int make_lines(
                     /* Find distance between end of line_a and beginning of line_b. */
                     double adv = spans_adv(
                             span_a,
-                            span_char_last(span_a),
+                            extract_span_char_last(span_a),
                             span_char_first(span_b)
                             );
                     if (verbose) outf("nearest_adv=%f. angle_a_b=%f adv=%f",
@@ -370,8 +545,8 @@ static int make_lines(
                 else {
                     if (verbose) outf(
                             "angle beyond tolerance: span_a last=(%f,%f) span_b first=(%f,%f) angle_a_b=%g angle_a=%g span_a.trm{a=%f b=%f}",
-                            span_char_last(span_a)->x,
-                            span_char_last(span_a)->y,
+                            extract_span_char_last(span_a)->x,
+                            extract_span_char_last(span_a)->y,
                             span_char_first(span_b)->x,
                             span_char_first(span_b)->y,
                             angle_a_b * 180 / pi,
@@ -386,24 +561,30 @@ static int make_lines(
         if (nearest_line) {
             /* line_a and nearest_line are aligned so we can move line_b's
             spans on to the end of line_a. */
-            span_t* span_b = line_span_first(nearest_line);
+            double average_adv;
+            span_t* span_b = s_line_span_first(nearest_line);
             b = nearest_line_b;
             if (verbose) outf("found nearest line. a=%i b=%i", a, b);
 
+            /* Find average advance of the two adjacent spans in the two
+            lines we are considering joining, so that we can decide whether
+            the distance between them is large enough to merit joining with
+            a space character. */
+            average_adv = (
+                    (span_adv_total(span_a) + span_adv_total(span_b))
+                    /
+                    (double) (span_a->chars_num + span_b->chars_num)
+                    );
+
+            if (0 && nearest_adv > 5 * average_adv)
+            {
+                continue;
+            }
+            
             if (1
-                    && span_char_last(span_a)->ucs != ' '
+                    && extract_span_char_last(span_a)->ucs != ' '
                     && span_char_first(span_b)->ucs != ' '
                     ) {
-                /* Find average advance of the two adjacent spans in the two
-                lines we are considering joining, so that we can decide whether
-                the distance between them is large enough to merit joining with
-                a space character). */
-                double average_adv = (
-                        (span_adv_total(span_a) + span_adv_total(span_b))
-                        /
-                        (double) (span_a->chars_num + span_b->chars_num)
-                        );
-
                 int insert_space = (nearest_adv > 0.25 * average_adv);
                 if (insert_space) {
                     /* Append space to span_a before concatenation. */
@@ -413,8 +594,8 @@ static int make_lines(
                                 nearest_adv,
                                 average_adv
                                 );
-                        outf("    a: %s", span_string(alloc, span_a));
-                        outf("    b: %s", span_string(alloc, span_b));
+                        outf("    a: %s", extract_span_string(alloc, span_a));
+                        outf("    b: %s", extract_span_string(alloc, span_b));
                     }
                     if (extract_realloc2(
                             alloc,
@@ -427,6 +608,13 @@ static int make_lines(
                     extract_bzero(item, sizeof(*item));
                     item->ucs = ' ';
                     item->adv = nearest_adv;
+                    /* This is a hack to give our extra space a vaguely useful
+                    (x,y) coordinate - this can be used later on when ordering
+                    paragraphs. We could try to be more accurate by adding
+                    item[-1]'s .adv suitably transformed by .wmode, .ctm and
+                    .trm. */
+                    item->x = item[-1].x;
+                    item->y = item[-1].y;
                 }
 
                 if (verbose) {
@@ -440,14 +628,14 @@ static int make_lines(
                             "joining line insert_space=%i a=%i (y=%f) to line b=%i (y=%f). nearest_adv=%f average_adv=%f",
                             insert_space,
                             a,
-                            span_char_last(span_a)->y,
+                            extract_span_char_last(span_a)->y,
                             b,
                             span_char_first(span_b)->y,
                             nearest_adv,
                             average_adv
                             );
-                    outf("a: %s", span_string(alloc, span_a));
-                    outf("b: %s", span_string(alloc, span_b));
+                    outf("a: %s", extract_span_string(alloc, span_a));
+                    outf("b: %s", extract_span_string(alloc, span_b));
                 }
             }
 
@@ -487,7 +675,7 @@ static int make_lines(
                 the new extended line_a needs checking again. */
                 a -= 1;
             }
-            outfx("new line is:\n    %s", line_string2(line_a));
+            outfx("num_joins=%i new line is:\n    %s", num_joins, line_string2(line_a));
         }
     }
 
@@ -524,7 +712,7 @@ static int make_lines(
     ret = 0;
 
     outf("Turned %i spans into %i lines. num_compatible=%i",
-            spans_num,
+            *spans_num,
             lines_num,
             num_compatible
             );
@@ -532,9 +720,18 @@ static int make_lines(
     end:
     if (ret) {
         /* Free everything. */
+        extract_span_free(alloc, &span);
         if (lines) {
             for (a=0; a<lines_num; ++a) {
-                if (lines[a])   extract_free(alloc, &lines[a]->spans);
+                if (lines[a])
+                {
+                    int s;
+                    for (s=0; s<lines[a]->spans_num; ++s)
+                    {
+                        extract_span_free(alloc, &lines[a]->spans[s]);
+                    }
+                    extract_free(alloc, &lines[a]->spans);
+                }
                 extract_free(alloc, &lines[a]);
             }
         }
@@ -552,7 +749,7 @@ static double line_font_size_max(line_t* line)
     for (i=0; i<line->spans_num; ++i) {
         span_t* span = line->spans[i];
         /* fixme: <size> should be double, which changes some output. */
-        double size = matrix_expansion(span->trm);
+        double size = extract_matrix_expansion(span->trm);
         if (size > size_max) {
             size_max = size;
         }
@@ -581,21 +778,35 @@ respectively.
 
 AQB is a right angle. We need to find AQ.
 */
-static double line_distance(
-        double ax,
-        double ay,
-        double bx,
-        double by,
-        double angle
-        )
+static double line_distance_y( double ax, double ay, double bx, double by, double angle)
 {
     double dx = bx - ax;
     double dy = by - ay;
 
-
     return dx * sin(angle) + dy * cos(angle);
 }
 
+/* Returns distance QB in above diagram; <angle> is passed to cos()/sin(), so is in radians. */
+static double line_distance_x( double ax, double ay, double bx, double by, double angle)
+{
+    double dx = bx - ax; /* Offset from (ax,ay) to (bx,by). */
+    double dy = by - ay;
+
+    return dx * cos(angle) - dy * sin(angle); /* Counterpart of line_distance_y()'s dx*sin + dy*cos. */
+}
+
+static double line_distance_xp(point_t a, point_t b, double angle) /* point_t form of line_distance_x(). */
+{
+    return line_distance_x(a.x, a.y, b.x, b.y, angle); /* Unpack the points into coordinates. */
+}
+
+static int lines_overlap(point_t a_left, point_t a_right, point_t b_left, point_t b_right, double angle) /* Returns 1 if extents a and b overlap along <angle>, else 0. */
+{
+    if (line_distance_xp(a_left, b_right, angle) < 0)  return 0; /* b ends before a starts. */
+    if (line_distance_xp(a_right, b_left, angle) >= 0) return 0; /* b starts at or after a's end. */
+    return 1;
+}
+
 
 /* A comparison function for use with qsort(), for sorting paragraphs within a
 page. */
@@ -606,14 +817,49 @@ static int paragraphs_cmp(const void* a, const void* b)
     line_t* a_line = paragraph_line_first(*a_paragraph);
     line_t* b_line = paragraph_line_first(*b_paragraph);
 
-    span_t* a_span = line_span_first(a_line);
-    span_t* b_span = line_span_first(b_line);
+    span_t* a_span = s_line_span_first(a_line);
+    span_t* b_span = s_line_span_first(b_line);
 
-    /* If ctm matrices differ, always return this diff first. Note that we
-    ignore .e and .f because if data is from ghostscript then .e and .f vary
-    for each span, and we don't care about these differences. */
-    int d = matrix_cmp4(&a_span->ctm, &b_span->ctm);
-    if (d)  return d;
+    if (0)
+    {
+        double a_angle = span_angle2(a_span);
+        double b_angle = span_angle2(b_span);
+        if (fabs(a_angle - b_angle) > 0.01)
+        {
+            outf0("angles differ: a_angle=%f b_angle=%f", a_angle, b_angle);
+            outf0("a_span: %s", extract_span_string(NULL, a_span));
+            outf0("b_span: %s", extract_span_string(NULL, b_span));
+            if (a_angle - b_angle > 3.14/2) {
+                /* Give up if more than 90 deg. */
+                return 0;
+            }
+            if (a_angle > b_angle)  return 1;
+            if (a_angle < b_angle)  return -1;
+            return 0;
+        }
+    }
+    if (1)
+    {
+        /* If ctm matrices differ, always return this diff first. Note that we
+        ignore .e and .f because if data is from ghostscript then .e and .f
+        vary for each span, and we don't care about these differences. */
+        int d = extract_matrix_cmp4(&a_span->ctm, &b_span->ctm);
+        if (d)
+        {
+            outf("extract_matrix_cmp4() returned non-zero.");
+            outf("a_span->ctm=%s trm=%s: %s",
+                    extract_matrix_string(&a_span->ctm),
+                    extract_matrix_string(&a_span->trm),
+                    extract_span_string(NULL, a_span)
+                    );
+            outf("b_span->ctm=%s trm=%s: %s",
+                    extract_matrix_string(&b_span->ctm),
+                    extract_matrix_string(&b_span->trm),
+                    extract_span_string(NULL, b_span)
+                    );
+            return d;
+        }
+    }
 
     {
         double a_angle = line_angle(a_line);
@@ -628,7 +874,7 @@ static int paragraphs_cmp(const void* a, const void* b)
             double ay = line_item_first(a_line)->y;
             double bx = line_item_first(b_line)->x;
             double by = line_item_first(b_line)->y;
-            double distance = line_distance(ax, ay, bx, by, angle);
+            double distance = line_distance_y(ax, ay, bx, by, angle);
             if (distance > 0)   return -1;
             if (distance < 0)   return +1;
         }
@@ -669,7 +915,7 @@ static int make_paragraphs(
     int num_joins;
     paragraph_t** paragraphs = NULL;
 
-    /* Start off with an paragraph_t for each line_t. */
+    /* Start off with a paragraph_t for each line_t. */
     int paragraphs_num = lines_num;
     if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * paragraphs_num)) goto end;
     /* Ensure we can clean up after error when setting up. */
@@ -685,11 +931,12 @@ static int make_paragraphs(
         paragraphs[a]->lines[0] = lines[a];
     }
 
+    /* Now join paragraphs together where possible. */
     num_joins = 0;
     for (a=0; a<paragraphs_num; ++a) {
-        paragraph_t* nearest_paragraph;
-        int nearest_paragraph_b;
-        double nearest_paragraph_distance;
+        paragraph_t* nearest_paragraph = NULL;
+        int nearest_paragraph_b = -1;
+        double nearest_paragraph_distance = -1;
         line_t* line_a;
         double angle_a;
         int verbose;
@@ -702,14 +949,9 @@ static int make_paragraphs(
             continue;
         }
 
-        nearest_paragraph = NULL;
-        nearest_paragraph_b = -1;
-        nearest_paragraph_distance = -1;
         assert(paragraph_a->lines_num > 0);
-
         line_a = paragraph_line_last(paragraph_a);
         angle_a = line_angle(line_a);
-
         verbose = 0;
 
         /* Look for nearest paragraph_t that could be appended to
@@ -732,7 +974,7 @@ static int make_paragraphs(
                 double ay = line_item_last(line_a)->y;
                 double bx = line_item_first(line_b)->x;
                 double by = line_item_first(line_b)->y;
-                double distance = line_distance(ax, ay, bx, by, angle_a);
+                double distance = line_distance_y(ax, ay, bx, by, angle_a);
                 if (verbose) {
                     outf(
                             "angle_a=%f a=(%f %f) b=(%f %f) delta=(%f %f) distance=%f:",
@@ -746,17 +988,39 @@ static int make_paragraphs(
                     outf("    line_a=%s", line_string2(alloc, line_a));
                     outf("    line_b=%s", line_string2(alloc, line_b));
                 }
-                if (distance > 0) {
+                if (distance > 0)
+                {
                     if (nearest_paragraph_distance == -1
-                            || distance < nearest_paragraph_distance) {
-                        if (verbose) {
-                            outf("updating nearest. distance=%f:", distance);
-                            outf("    line_a=%s", line_string2(alloc, line_a));
-                            outf("    line_b=%s", line_string2(alloc, line_b));
+                            || distance < nearest_paragraph_distance)
+                    {
+                        int ok = 1;
+                        if (0)
+                        {
+                            /* Check whether lines overlap horizontally. */
+                            point_t a_left = char_to_point(line_item_first(line_a));
+                            point_t b_left = char_to_point(line_item_first(line_b));
+                            point_t a_right = char_to_point(line_item_last(line_a));
+                            point_t b_right = char_to_point(line_item_last(line_b));
+
+                            if (!lines_overlap(a_left, a_right, b_left, b_right, angle_a))
+                            {
+                                outf("Not joining lines because not overlapping.");
+                                ok = 0;
+                            }
+                        }
+
+                        if (ok)
+                        {
+                            if (verbose) {
+                                outf("updating nearest. distance=%f:", distance);
+                                outf("    line_a=%s", line_string2(alloc, line_a));
+                                outf("    line_b=%s", line_string2(alloc, line_b));
+                            }
+
+                            nearest_paragraph_distance = distance;
+                            nearest_paragraph_b = b;
+                            nearest_paragraph = paragraph_b;
                         }
-                        nearest_paragraph_distance = distance;
-                        nearest_paragraph_b = b;
-                        nearest_paragraph = paragraph_b;
                     }
                 }
             }
@@ -787,24 +1051,34 @@ static int make_paragraphs(
                     outf("    %s", paragraph_string(alloc, paragraph_a));
                     outf("    %s", paragraph_string(alloc, nearest_paragraph));
                     outf("paragraph_a ctm=%s",
-                            matrix_string(&paragraph_a->lines[0]->spans[0]->ctm)
+                            extract_matrix_string(&paragraph_a->lines[0]->spans[0]->ctm)
                             );
                     outf("paragraph_a trm=%s",
-                            matrix_string(&paragraph_a->lines[0]->spans[0]->trm)
+                            extract_matrix_string(&paragraph_a->lines[0]->spans[0]->trm)
                             );
                 }
                 /* Join these two paragraph_t's. */
-                a_span = line_span_last(line_a);
-                if (span_char_last(a_span)->ucs == '-') {
+                a_span = extract_line_span_last(line_a);
+                if (extract_span_char_last(a_span)->ucs == '-'
+                        || extract_span_char_last(a_span)->ucs == 0x2212 /* unicode dash */
+                        )
+                {
                     /* remove trailing '-' at end of prev line. char_t doesn't
                     contain any malloc-heap pointers so this doesn't leak. */
                     a_span->chars_num -= 1;
                 }
-                else {
+                else if (extract_span_char_last(a_span)->ucs == ' ')
+                {
+                }
+                else if (extract_span_char_last(a_span)->ucs == '/')
+                {
+                }
+                else
+                {
                     /* Insert space before joining adjacent lines. */
                     char_t* c_prev;
                     char_t* c;
-                    if (span_append_c(alloc, line_span_last(line_a), ' ')) goto end;
+                    if (extract_span_append_c(alloc, extract_line_span_last(line_a), ' ')) goto end;
                     c_prev = &a_span->chars[ a_span->chars_num-2];
                     c = &a_span->chars[ a_span->chars_num-1];
                     c->x = c_prev->x + c_prev->adv * a_span->ctm.a;
@@ -834,9 +1108,10 @@ static int make_paragraphs(
 
                 num_joins += 1;
                 outfx(
-                        "have joined paragraph a=%i to snearest_paragraph_b=%i",
+                        "have joined paragraph a=%i to nearest_paragraph_b=%i. num_joins=%i.",
                         a,
-                        nearest_paragraph_b
+                        nearest_paragraph_b,
+                        num_joins
                         );
 
                 if (nearest_paragraph_b > a) {
@@ -884,26 +1159,21 @@ static int make_paragraphs(
 
     /* Sort paragraphs so they appear in correct order, using paragraphs_cmp().
     */
-    qsort(
-            paragraphs,
-            paragraphs_num,
-            sizeof(paragraph_t*), paragraphs_cmp
-            );
+    qsort(paragraphs, paragraphs_num, sizeof(paragraph_t*), paragraphs_cmp);
 
     *o_paragraphs = paragraphs;
     *o_paragraphs_num = paragraphs_num;
     ret = 0;
-    outf("Turned %i lines into %i paragraphs",
-            lines_num,
-            paragraphs_num
-            );
-
+    outf("Turned %i lines into %i paragraphs", lines_num, paragraphs_num);
 
     end:
 
-    if (ret) {
-        if (paragraphs) {
-            for (a=0; a<paragraphs_num; ++a) {
+    if (ret)
+    {
+        if (paragraphs)
+        {
+            for (a=0; a<paragraphs_num; ++a)
+            {
                 if (paragraphs[a])   extract_free(alloc, &paragraphs[a]->lines);
                 extract_free(alloc, &paragraphs[a]);
             }
@@ -913,39 +1183,688 @@ static int make_paragraphs(
     return ret;
 }
 
-int extract_document_join(extract_alloc_t* alloc, document_t* document)
+static int s_join_page_rects(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page,
+        rect_t*             rects,
+        int                 rects_num,
+        line_t***           lines,
+        int*                lines_num,
+        paragraph_t***      paragraphs,
+        int*                paragraphs_num
+        )
+/* Extracts text that is inside any of rects[0..rects_num], or all text if rects_num is zero, by
+calling make_lines() then make_paragraphs(). Returns 0, or -1 if either call returns non-zero. */
 {
-    int ret = -1;
+    if (make_lines(
+            alloc,
+            page->spans,
+            &page->spans_num,
+            rects,
+            rects_num,
+            lines,
+            lines_num
+            )) return -1;
+    if (make_paragraphs(
+            alloc,
+            *lines,
+            *lines_num,
+            paragraphs,
+            paragraphs_num
+            )) return -1;
+    
+    return 0;
+}
+
+
+static int tablelines_compare_x(const void* a, const void* b)
+/* qsort() comparator for tableline_t: orders by rect.min.x, then by rect.min.y to break ties. */
+{
+    const tableline_t*  aa = a;
+    const tableline_t*  bb = b;
+    if (aa->rect.min.x > bb->rect.min.x)    return +1; /* Primary key: x. */
+    if (aa->rect.min.x < bb->rect.min.x)    return -1;
+    if (aa->rect.min.y > bb->rect.min.y)    return +1; /* Secondary key: y. */
+    if (aa->rect.min.y < bb->rect.min.y)    return -1;
+    return 0;
+}
 
-    /* For each page in <document> we join spans into lines and paragraphs. A
-    line is a list of spans that are at the same angle and on the same line. A
-    paragraph is a list of lines that are at the same angle and close together.
+static int tablelines_compare_y(const void* a, const void* b)
+/* Compares two tableline_t's by rect.min.y, then by rect.min.x to break ties; qsort()-compatible signature. */
+{
+    const tableline_t*  aa = a;
+    const tableline_t*  bb = b;
+    if (aa->rect.min.y > bb->rect.min.y)    return +1; /* Primary key: y. */
+    if (aa->rect.min.y < bb->rect.min.y)    return -1;
+    if (aa->rect.min.x > bb->rect.min.x)    return +1; /* Secondary key: x. */
+    if (aa->rect.min.x < bb->rect.min.x)    return -1;
+    return 0;
+}
+
+static int table_find_y_range(extract_alloc_t* alloc, tablelines_t* all, double y_min, double y_max,
+        tablelines_t* out)
+/* Appends to <out> each line in <all> whose rect.min.y is in the half-open range
+y_min <= y < y_max. Returns 0, or -1 if extract_realloc() returns non-zero. */
+{
+    int i;
+    for (i=0; i<all->tablelines_num; ++i)
+    {
+        if (all->tablelines[i].rect.min.y >= y_min && all->tablelines[i].rect.min.y < y_max)
+        {
+            if (extract_realloc(alloc, &out->tablelines, sizeof(*out->tablelines) * (out->tablelines_num + 1))) return -1;
+            out->tablelines[out->tablelines_num] = all->tablelines[i]; /* Struct copy; <all> is left unchanged. */
+            out->tablelines_num += 1;
+        }
+        else
+        {
+            outf("Excluding line because outside y=%f..%f: %s", y_min, y_max, extract_rect_string(&all->tablelines[i].rect));
+        }
+    }
+    return 0;
+}
+
+
+static int overlap(double a_min, double a_max, double b_min, double b_max)
+/* Returns one if a_min..a_max significantly overlaps b_min..b_max (more than
+80% of a's length lies inside b), otherwise zero. */
+{
+    double overlap;
+    int ret0;
+    int ret1;
+    assert(a_min < a_max);
+    assert(b_min < b_max);
+    if (b_min < a_min)  b_min = a_min; /* Clip b to a's extent. */
+    if (b_max > a_max)  b_max = a_max;
+    if (b_max < b_min)  b_max = b_min; /* Disjoint ranges give zero length. */
+    overlap = (b_max - b_min) / (a_max - a_min); /* Fraction of a covered by b. */
+    ret0 = overlap > 0.2; /* ret0/ret1 only feed the disabled warning below. */
+    ret1 = overlap > 0.8;
+    if (ret0 != ret1)
+    {
+        if (0) outf0("warning, unclear overlap=%f: a=%f..%f b=%f..%f", overlap, a_min, a_max, b_min, b_max);
+    }
+    return overlap > 0.8;
+}
+
+void extract_cell_init(cell_t* cell) /* Sets <cell> to an empty state: zero rect, flags and extents, no lines or paragraphs. */
+{
+    cell->rect.min.x = 0;
+    cell->rect.min.y = 0;
+    cell->rect.max.x = 0;
+    cell->rect.max.y = 0;
+    cell->above = 0;
+    cell->left = 0;
+    cell->extend_right = 0; /* Note: table_find() starts its cells with extend_right/down = 1 instead. */
+    cell->extend_down = 0;
+    cell->lines = NULL;
+    cell->lines_num = 0;
+    cell->paragraphs = NULL;
+    cell->paragraphs_num = 0;
+}
+
+
+static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y)
+{    
+    /* Find cell extensions to right and down by looking at cells' .left and
+    .above flags.
+    
+    For example for adjacent cells ABC..., we extend A to include cells BC..
+    until we reach a cell with .left set to one.
+    
+    ABCDE
+    FGHIJ
+    KLMNO
+    
+    When looking to extend cell A, we only look at cells in the same column or
+    same row, (i.e. in the above example we look at BCDE and FK, and not at
+    GHIJ and LMNO).
+
+    For example if BCDE have no left lines and FK have no above lines, we
+    ignore any lines in GHIJ and LMNO and make A extend to the entire 3x5
+    box. Having found this box, we set .above=0 in all other enclosed cells and
+    .left=0 in all but the left-most column, which simplifies html generation.
     */
-    int p;
-    for (p=0; p<document->pages_num; ++p) {
-        extract_page_t* page = document->pages[p];
-        outf("processing page %i: num_spans=%i", p, page->spans_num);
+    int y;
+    for (y=0; y<cells_num_y; ++y)
+    {
+        int x;
+        for (x=0; x<cells_num_x; ++x)
+        {
+            cell_t* cell = cells[y * cells_num_x + x];
+            outf("xy=(%i %i) above=%i left=%i", x, y, cell->above, cell->left);
+            if (cell->left && cell->above)
+            {
+                /* See how far this cell extends to right and down. */
+                int xx;
+                int yy;
+                for (xx=x+1; xx<cells_num_x; ++xx)
+                {
+                    if (cells[y * cells_num_x + xx]->left)  break;
+                }
+                cell->extend_right = xx - x;
+                cell->rect.max.x = cells[y * cells_num_x + xx-1]->rect.max.x;
+                for (yy=y+1; yy<cells_num_y; ++yy)
+                {
+                    if (cells[yy * cells_num_x + x]->above) break;
+                }
+                cell->extend_down = yy - y;
+                cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y;
+                
+                /* Clear .above and .left in enclosed cells. */
+                for (xx = x; xx < x + cell->extend_right; ++xx)
+                {
+                    int yy;
+                    for (yy = y; yy < y + cell->extend_down; ++yy)
+                    {
+                        cell_t* cell2 = cells[cells_num_x * yy  + xx];
+                        if ( xx==x && yy==y)
+                        {}
+                        else
+                        {
+                            if (xx==x)
+                            {
+                                cell2->extend_right = cell->extend_right;
+                            }
+                            cell2->above = 0;
+                            /* We set .left to 1 for left-most cells - e.g. F
+                            and K in the above diagram; this allows us to
+                            generate correct html without lots of recursing
+                            looking for extend_down in earlier cells. */
+                            cell2->left = (xx == x);
+                            outf("xy=(%i %i) xxyy=(%i %i) have set cell2->above=%i left=%i",
+                                    x, y, xx, yy, cell2->above, cell2->left
+                                    );
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
 
-        if (make_lines(
-                alloc,
-                page->spans,
-                page->spans_num,
-                &page->lines,
-                &page->lines_num
-                )) goto end;
 
-        if (make_paragraphs(
+static int table_find_cells_text(extract_alloc_t* alloc, extract_page_t* page,
+        cell_t** cells, int cells_num_x, int cells_num_y)
+/* Sets each cell to contain the text that is within the cell's boundary. We
+remove any found text from the page. */
+{
+    /* Find text within each cell. We don't attempt to handle images within
+    cells. */
+    int e = -1;
+    int i;
+    int cells_num = cells_num_x * cells_num_y;
+    for (i=0; i<cells_num; ++i)
+    {
+        cell_t* cell = cells[i];
+        if (!cell->above || !cell->left) continue;
+        if (s_join_page_rects(
                 alloc,
-                page->lines,
-                page->lines_num,
-                &page->paragraphs,
-                &page->paragraphs_num
-                )) goto end;
+                page,
+                &cell->rect,
+                1 /*rects_num*/,
+                &cell->lines,
+                &cell->lines_num,
+                &cell->paragraphs,
+                &cell->paragraphs_num
+                )) return -1;
     }
+    
+    /* Append the table we have found to page->tables[]. */
+    if (extract_realloc(alloc, &page->tables, sizeof(*page->tables) * (page->tables_num + 1))) goto end;
+    if (extract_malloc(alloc, &page->tables[page->tables_num], sizeof(*page->tables[page->tables_num]))) goto end;
+    page->tables[page->tables_num]->pos.x = cells[0]->rect.min.x;
+    page->tables[page->tables_num]->pos.y = cells[0]->rect.min.y;
+    page->tables[page->tables_num]->cells = cells;
+    page->tables[page->tables_num]->cells_num_x = cells_num_x;
+    page->tables[page->tables_num]->cells_num_y = cells_num_y;
+    page->tables_num += 1;
+    
+    if (0)
+    {
+        /* For debugging. */
+        int y;
+        outf0("table:\n");
+        for (y=0; y<cells_num_y; ++y)
+        {
+            int x;
+            for (x=0; x<cells_num_x; ++x)
+            {
+                cell_t* cell = cells[cells_num_x * y + x];
+                fprintf(stderr, "    %c%c x=%i y=% 3i 3i w=%i h=%i",
+                        cell->left ? '|' : ' ',
+                        cell->above ? '-' : ' ',
+                        x,
+                        y,
+                        cell->extend_right,
+                        cell->extend_down
+                        );
+            }
+            fprintf(stderr, "\n");
+        }
+        
+    }
+    
+    e = 0;
+    end:
+    return e;
+}
 
-    ret = 0;
 
+static int table_find(extract_alloc_t* alloc, extract_page_t* page, double y_min, double y_max)
+/* Finds single table made from lines whose y coordinates are in the range
+y_min..y_max. */
+{
+    tablelines_t* all_h = &page->tablelines_horizontal;
+    tablelines_t* all_v = &page->tablelines_vertical;
+    int e = -1;
+    int i;
+    
+    /* Find subset of vertical and horizontal lines that are within range
+    y_min..y_max, and sort by y coordinate. */
+    tablelines_t    tl_h = {NULL, 0};
+    tablelines_t    tl_v = {NULL, 0};
+    cell_t**    cells = NULL;
+    int         cells_num = 0;
+    int         cells_num_x = 0;
+    int         cells_num_y = 0;
+    int x;
+    int y;
+
+    outf("y=(%f %f)", y_min, y_max);
+    
+    if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end;
+    if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end;
+    /* Suppress false coverity warning - qsort() does not dereference null
+    pointer if nmemb is zero. */
+    /* coverity[var_deref_model] */
+    qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x);
+    
+    if (0)
+    {
+        /* Show raw lines info. */
+        outf0("all_h->tablelines_num=%i tl_h.tablelines_num=%i", all_h->tablelines_num, tl_h.tablelines_num);
+        for (i=0; i<tl_h.tablelines_num; ++i)
+        {
+            outf0("    %i: %s", i, extract_rect_string(&tl_h.tablelines[i].rect));
+        }
+
+        outf0("all_v->tablelines_num=%i tl_v.tablelines_num=%i", all_v->tablelines_num, tl_v.tablelines_num);
+        for (i=0; i<tl_v.tablelines_num; ++i)
+        {
+            outf0("    %i: %s", i, extract_rect_string(&tl_v.tablelines[i].rect));
+        }
+    }
+    /* Find the cells defined by the vertical and horizontal lines.
+
+    It seems that lines can be disjoint, e.g. what looks like a single
+    horizontal line could be made up of multiple lines all with the same
+    y coordinate, so we use i_next and j_next to skip these sublines when
+    iterating. */
+    cells = NULL;
+    cells_num = 0;
+    cells_num_x = 0;
+    cells_num_y = 0;
+    for (i=0; i<tl_h.tablelines_num; )
+    {
+        int i_next;
+        int j;
+        for (i_next=i+1; i_next<tl_h.tablelines_num; ++i_next)
+        {
+            if (tl_h.tablelines[i_next].rect.min.y - tl_h.tablelines[i].rect.min.y > 5) break;
+        }
+        if (i_next == tl_h.tablelines_num)
+        {
+            /* Ignore last row of points - cells need another row below. */
+            break;
+        }
+        cells_num_y += 1;
+        
+        for (j=0; j<tl_v.tablelines_num; )
+        {
+            int j_next;
+            int ii;
+            int jj;
+            cell_t* cell;
+            
+            for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next)
+            {
+                if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break;
+            }
+            outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect));
+            
+            if (j_next == tl_v.tablelines_num) break;
+                        
+            if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end;
+            if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end;
+            cell = cells[cells_num];
+            cells_num += 1;
+            if (i==0)   cells_num_x += 1;
+            
+            cell->rect.min.x = tl_v.tablelines[j].rect.min.x;
+            cell->rect.min.y = tl_h.tablelines[i].rect.min.y;
+            cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x;
+            cell->rect.max.y = (i_next < tl_h.tablelines_num) ? tl_h.tablelines[i_next].rect.min.y : cell->rect.min.y;
+            cell->above = (i==0);
+            cell->left = (j==0);
+            cell->extend_right = 1;
+            cell->extend_down = 1;
+            cell->lines = NULL;
+            cell->lines_num = 0;
+            cell->paragraphs = NULL;
+            cell->paragraphs_num = 0;
+            
+            /* Set cell->above if there is a horizontal line above the cell. */
+            outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect));
+            for (ii = i; ii < i_next; ++ii)
+            {
+                tableline_t* h = &tl_h.tablelines[ii];
+                if (overlap(
+                        cell->rect.min.x,
+                        cell->rect.max.x,
+                        h->rect.min.x,
+                        h->rect.max.x
+                        ))
+                {
+                    cell->above = 1;
+                    break;
+                }
+            }
+            
+            /* Set cell->left if there is a vertical line to the left of the cell. */
+            for (jj = j; jj < j_next; ++jj)
+            {
+                tableline_t* v = &tl_v.tablelines[jj];
+                if (overlap(
+                        cell->rect.min.y,
+                        cell->rect.max.y,
+                        v->rect.min.y,
+                        v->rect.max.y
+                        ))
+                {
+                    cell->left = 1;
+                    break;
+                }
+            }
+            
+            j = j_next;
+        }
+        
+        i = i_next;
+    }
+    
+    assert(cells_num == cells_num_x * cells_num_y);
+    
+    /* Remove cols and rows where no cells have .above and .left - these
+    will not appear. It also avoids spurious empty columns when table uses
+    closely-spaced double lines as separators. */
+    for (x=0; x<cells_num_x; ++x)
+    {
+        int has_cells = 0;
+        for (y=0; y<cells_num_y; ++y)
+        {
+            cell_t* cell = cells[y * cells_num_x + x];
+            if (cell->above && cell->left)
+            {
+                has_cells = 1;
+                break;
+            }
+        }
+        if (!has_cells)
+        {
+            /* Remove column <x>. */
+            int j = 0;
+            outf("Removing column %i. cells_num=%i cells_num_x=%i cells_num_y=%i", x, cells_num, cells_num_x, cells_num_y);
+            for (i=0; i<cells_num; ++i)
+            {
+                if (i % cells_num_x == x)
+                {
+                    extract_cell_free(alloc, &cells[i]);
+                    continue;
+                }
+                cells[j] = cells[i];
+                j += 1;
+            }
+            cells_num -= cells_num_y;
+            cells_num_x -= 1;
+        }
+    }
+    
+    if (cells_num == 0)
+    {
+        e = 0;
+        goto end;
+    }
+
+    if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end;
+    
+    if (table_find_cells_text(alloc, page, cells, cells_num_x, cells_num_y)) goto end;
+    
+    e = 0;
     end:
+    extract_free(alloc, &tl_h.tablelines);
+    extract_free(alloc, &tl_v.tablelines);
+    if (e)
+    {
+        for (i=0; i<cells_num; ++i)
+        {
+            extract_cell_free(alloc, &cells[i]);
+        }
+        extract_free(alloc, &cells);
+    }
+    return e;
+}
 
-    return ret;
+
+static int extract_page_tables_find_lines(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page
+        )
+/* Finds tables in <page> by looking for lines in page->tablelines_horizontal
+and page->tablelines_vertical that look like table dividers. Any text found
+inside tables is removed from page->spans[].
+
+Returns 0 on success, or -1 as soon as table_find() fails for any region. */
+{
+    double miny;
+    double maxy;
+    double margin = 1;  /* y gap that separates two tables; also padding passed to table_find(). */
+    int iv;
+    int ih;
+    outf("page->tablelines_horizontal.tablelines_num=%i", page->tablelines_horizontal.tablelines_num);
+    outf("page->tablelines_vertical.tablelines_num=%i", page->tablelines_vertical.tablelines_num);
+    
+    /* Sort all lines by y coordinate. */
+    qsort(
+            page->tablelines_horizontal.tablelines,
+            page->tablelines_horizontal.tablelines_num,
+            sizeof(*page->tablelines_horizontal.tablelines),
+            tablelines_compare_y
+            );
+    qsort(
+            page->tablelines_vertical.tablelines,
+            page->tablelines_vertical.tablelines_num,
+            sizeof(*page->tablelines_vertical.tablelines),
+            tablelines_compare_y
+            );
+    
+    if (0)
+    {
+        /* Show info about lines. */
+        int i;
+        outf0("tablelines_horizontal:");
+        for (i=0; i<page->tablelines_horizontal.tablelines_num; ++i)
+        {
+            outf0("    color=%f: %s",
+                    page->tablelines_horizontal.tablelines[i].color,
+                    extract_rect_string(&page->tablelines_horizontal.tablelines[i].rect)
+                    );
+        }
+        outf0("tablelines_vertical:");
+        for (i=0; i<page->tablelines_vertical.tablelines_num; ++i)
+        {
+            outf0("    color=%f: %s",
+                    page->tablelines_vertical.tablelines[i].color,
+                    extract_rect_string(&page->tablelines_vertical.tablelines[i].rect)
+                    );
+        }
+    }
+    
+    /* Look for completely separate vertical regions that define different
+    tables, by looking for vertical gaps between the rects of each
+    horizontal/vertical line. */
+    maxy = -DBL_MAX;
+    miny = -DBL_MAX;
+    iv = 0;
+    ih = 0;
+    for(;;)
+    {
+        tableline_t* tlv = NULL;
+        tableline_t* tlh = NULL;
+        tableline_t* tl;
+        if (iv < page->tablelines_vertical.tablelines_num)
+        {
+            tlv = &page->tablelines_vertical.tablelines[iv];
+        }
+        /* We only consider horizontal lines that are not white. This is a bit
+        of a cheat to get the right behaviour with twotables_2.pdf. */
+        while (ih < page->tablelines_horizontal.tablelines_num)
+        {
+            if (page->tablelines_horizontal.tablelines[ih].color == 1)
+            {
+                /* Ignore white horizontal lines. */
+                ++ih;
+            }
+            else
+            {
+                tlh = &page->tablelines_horizontal.tablelines[ih];
+                break;
+            }
+        }
+        if (tlv && tlh)
+        {
+            tl = (tlv->rect.min.y < tlh->rect.min.y) ? tlv : tlh;   /* Merge both sorted lists by min.y. */
+        }
+        else if (tlv) tl = tlv;
+        else if (tlh) tl = tlh;
+        else break;
+        if (tl == tlv)  iv += 1;    /* Consume whichever line we picked. */
+        else ih += 1;
+        if (tl->rect.min.y > maxy + margin)
+        {
+            if (maxy > miny)
+            {
+                outf("New table. maxy=%f miny=%f", maxy, miny);
+                /* Find table; do not carry on after a failure. */
+                if (table_find(alloc, page, miny - margin, maxy + margin)) return -1;
+            }
+            miny = tl->rect.min.y;
+        }
+        if (tl->rect.max.y > maxy)  maxy = tl->rect.max.y;
+    }
+    
+    /* Find last table, again reporting failure to our caller. */
+    if (table_find(alloc, page, miny - margin, maxy + margin)) return -1;
+    
+    return 0;
+}
+
+
+static void show_tables(table_t** tables, int tables_num)
+/* For debugging only. Writes, via outf0(), each table's grid size and each cell's left/above flags and rect. */
+{
+    int i;
+    outf0("tables_num=%i", tables_num);
+    for (i=0; i<tables_num; ++i)
+    {
+        table_t* table = tables[i];
+        int y;
+        outf0("table %i: cells_num_y=%i cells_num_x=%i", i, table->cells_num_y, table->cells_num_x);
+        for (y=0; y<table->cells_num_y; ++y)
+        {
+            int x;
+            for (x=0; x<table->cells_num_x; ++x)
+            {
+                cell_t* cell = table->cells[table->cells_num_x * y + x];  /* Cell (x, y) lives at index cells_num_x*y + x. */
+                outf0("cell: y=% 3i x=% 3i: left=%i above=%i rect=%s",
+                        y, x, cell->left, cell->above, extract_rect_string(&cell->rect));
+            }
+        }
+    }
+}
+
+static int extract_page_tables_find(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page
+        )
+/* Find tables in <page>. Returns -1 if extract_page_tables_find_lines()
+returns non-zero, otherwise 0. At the moment this only calls
+extract_page_tables_find_lines(), but in future will call other functions
+that find tables in different ways, e.g. by analysing an image of a page, or
+looking for blocks of whitespace in between chunks of
+text. */
+{
+    if (extract_page_tables_find_lines(alloc, page)) return -1;
+
+    if (0)  /* Disabled debugging dump of the tables found above. */
+    {
+        outf0("=== tables from extract_page_tables_find_lines():");
+        show_tables(page->tables, page->tables_num);
+    }
+
+    return 0;
+}
+
+static int extract_document_join_page(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page
+        )
+/* Finds tables and paragraphs on <page>. Returns 0 on success, -1 if either step fails. */
+{
+    /* Find tables on this page first. This will remove text that is within
+    tables from page->spans, so that text doesn't appear more than once in
+    the final output. */
+    if (extract_page_tables_find(alloc, page)) return -1;
+
+    /* Now join remaining spans into lines and paragraphs. */
+    if (s_join_page_rects(
+            alloc,
+            page,
+            NULL /*rects*/,
+            0 /*rects_num*/,
+            &page->lines,
+            &page->lines_num,
+            &page->paragraphs,
+            &page->paragraphs_num
+            ))
+    {
+        outf0("s_join_page_rects failed. page->spans_num=%i page->lines_num=%i page->paragraphs_num=%i",
+                page->spans_num,
+                page->lines_num,
+                page->paragraphs_num
+                );
+        return -1;
+    }
+    
+    return 0;
+}
+
+
+int extract_document_join(extract_alloc_t* alloc, document_t* document)
+{
+    /* For each page in <document> we find tables and join spans into lines and
+    paragraphs. Returns 0, or -1 as soon as any page fails.
+
+    A line is a list of spans that are at the same angle and on the same
+    line. A paragraph is a list of lines that are at the same angle and close
+    together. */
+    int p;
+    for (p=0; p<document->pages_num; ++p) {
+        extract_page_t* page = document->pages[p];
+        
+        outf("processing page %i: num_spans=%i", p, page->spans_num);
+        if (extract_document_join_page(alloc, page)) return -1;
+    }
+
+    return 0;
 }
diff --git a/extract/src/mem.c b/extract/src/mem.c
index 83b5032c..1c3c96e6 100644
--- a/extract/src/mem.c
+++ b/extract/src/mem.c
@@ -19,16 +19,26 @@ void extract_bzero(void *b, size_t len)
 int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va)
 {
     int n;
-    int n2;
+    int ret;    /* Value returned via the common exit at <end>. */
     va_list va2;
     va_copy(va2, va);
     n = vsnprintf(NULL, 0, format, va);
-    if (n < 0) return n;
-    if (extract_malloc(alloc, out, n + 1)) return -1;
-    n2 = vsnprintf(*out, n + 1, format, va2);
+    if (n < 0)
+    {
+        ret = n;    /* Pass vsnprintf()'s negative result through unchanged. */
+        goto end;
+    }
+    if (extract_malloc(alloc, out, n + 1))
+    {
+        ret = -1;
+        goto end;
+    }
+    vsnprintf(*out, n + 1, format, va2);    /* Second pass fills the n+1 byte buffer sized by the first pass. */
+    ret = 0;    /* Success is reported as 0, not as the formatted length. */
+    
+    end:    /* Every path after va_copy() leaves through here so that va2 is always va_end()'d. */
     va_end(va2);
-    assert(n2 == n);
-    return n2;
+    return ret;
 }
 
 
diff --git a/extract/src/mem.h b/extract/src/mem.h
index ffdcb049..2611b04f 100644
--- a/extract/src/mem.h
+++ b/extract/src/mem.h
@@ -8,8 +8,17 @@
 
 void extract_bzero(void *b, size_t len);
 
-int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va);
-int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...);
+int extract_vasprintf(extract_alloc_t* alloc, char** out, const char* format, va_list va)
+        #ifdef __GNUC__
+        __attribute__ ((format (printf, 3, 0)))
+        #endif
+        ;
+
+int extract_asprintf(extract_alloc_t* alloc, char** out, const char* format, ...)
+        #ifdef __GNUC__
+        __attribute__ ((format (printf, 3, 4)))
+        #endif
+        ;
 
 int extract_strdup(extract_alloc_t* alloc, const char* s, char** o_out);
 
diff --git a/extract/src/memento.py b/extract/src/memento.py
index 987cd4fd..55171e39 100755
--- a/extract/src/memento.py
+++ b/extract/src/memento.py
@@ -3,20 +3,29 @@
 '''
 Post-processor for Memento.
 
+Usage:
+    memento.py <args> [<command> ...]
+
 Args:
     -q <quiet>
         Controls how often we output 'Memory squeezing @ ...' lines. E.g. '-q
         10' outputs for multiples of 10.
+
+If <command> is specified we run it and look at the output. Otherwise we assume
+that Memento output is available on our stdin.
 '''
 
 import os
 import re
+import subprocess
 import sys
 
 
 def main():
     quiet = 1
+    quiet_next = 0
     out_raw = None
+    command = None
     args = iter(sys.argv[1:])
     while 1:
         try:
@@ -29,15 +38,32 @@ def main():
             out_raw = open(next(args), 'w')
         elif arg == '-q':
             quiet = int(next(args))
-        else:
+        elif arg.startswith('-'):
             raise Exception(f'unrecognised arg: {arg}')
+        else:
+            command = arg
+            for a in args:
+                command += f' {a}'
+
+    if command:
+        print(f'Running: {command}')
+        child = subprocess.Popen(
+                command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                shell=True,
+                text=True,
+                )
+        stdin = child.stdout
+    else:
+        stdin = sys.stdin
     
     openbsd = os.uname()[0] == 'OpenBSD'
     n = None
     segv = 0
     leaks = 0
     lines = []
-    for line in sys.stdin:
+    for line in stdin:
         if out_raw:
             out_raw.write(line)
         m = re.match('^Memory squeezing @ ([0-9]+)( complete)?', line)
@@ -45,7 +71,7 @@ def main():
             if not m.group(2):
                 # Start of squeeze.
                 
-                if not openbsd:
+                if 0 and not openbsd:
                     # Looks like memento's forked processes might terminate
                     # before they get to output the 'Memory squeezing @ <N>
                     # complete' line.
@@ -53,9 +79,10 @@ def main():
                     assert n is None, f'n={n} line={line!r}'
                 
                 n = int(m.group(1))
-                if n % quiet == 0:
-                    sys.stdout.write(line)
+                if n >= quiet_next:
+                    sys.stdout.write(f'quiet_next={quiet_next!r} n={n!r}: {line}')
                     sys.stdout.flush()
+                    quiet_next = (n + quiet) // quiet * quiet
             else:
                 # End of squeeze.
                 assert n == int(m.group(1))
@@ -66,6 +93,8 @@ def main():
                         if l.endswith('\n'):
                             l = l[:-1]
                         print(f'    {l}')
+                    if command:
+                        print(f'Examine with: MEMENTO_FAILAT={n} {command}')
                 lines = []
                 segv = 0
                 leaks = 0
diff --git a/extract/src/misc-test.c b/extract/src/misc-test.c
index 58b098ff..5e658e8f 100644
--- a/extract/src/misc-test.c
+++ b/extract/src/misc-test.c
@@ -35,6 +35,15 @@ static void s_check(
     if (!ok) s_num_fails += 1;
 }
 
+static void s_check_e( int e, const char* text) /* If <e> is non-zero, counts a failure and prints <e> and <text>. */
+{
+    if (e)
+    {
+        s_num_fails += 1;
+        printf( "Error: e=%i: %s\n", e, text);
+    }
+}
+
 static void s_check_int(const char* text, int value_expected, int expected_errno)
 {
     int     value;
@@ -59,6 +68,53 @@ static void s_check_uint(const char* text, unsigned expected_value, int expected
     return;
 }
 
+static void s_check_xml_parse(void) /* Parses the first tag of each sample text and checks that no name/value pointer is NULL. */
+{
+    int e;
+    extract_buffer_t* buffer;
+    extract_xml_tag_t tag;
+    unsigned i;
+    const char* texts[] = {     /* One well-formed tag followed by deliberately odd/malformed ones. */
+            "<foo a=1>text</foo>",
+            "< >",
+            "<foo bar=>",
+            "< bar=>",
+            "< =>",
+            };
+    
+    extract_xml_tag_init( &tag);
+    
+    for (i=0; i<sizeof(texts) / sizeof(texts[0]); ++i)
+    {
+        const char* text = texts[i];
+        printf("testing extract_xml_pparse_*(): %s\n", text);
+        e = extract_buffer_open_simple(
+                NULL /*alloc*/,
+                text,
+                strlen(text),
+                NULL /*handle*/,
+                NULL /*fn_close*/,
+                &buffer
+                );
+        s_check_e( e, "extract_buffer_open_simple()");
+        e = extract_xml_pparse_init( NULL /*alloc*/, buffer, NULL /*first_line*/);
+        s_check_e( e, "extract_xml_pparse_init()");
+
+        e = extract_xml_pparse_next( buffer, &tag);
+        s_check_e( e, "extract_xml_pparse_next()");
+        s_check_e( tag.name ? 0 : 1, "tag.name is not null");   /* s_check_e() treats non-zero as failure, hence the inversion. */
+        
+        {
+            int j;
+            for (j=0; j<tag.attributes_num; ++j)
+            {
+                s_check_e( tag.attributes[j].name ? 0 : 1, "attribute is non-null");
+                s_check_e( tag.attributes[j].value ? 0 : 1, "attribute is non-null");
+            }
+        }
+    }   /* NOTE(review): <buffer> is never closed and <tag> is never freed here - confirm whether cleanup is needed. */
+}
+
 int main(void)
 {
     printf("testing extract_xml_str_to_int():\n");
@@ -73,6 +129,8 @@ int main(void)
     s_check_uint("-20b", 0, EINVAL);
     s_check_uint("123456789123", 0, ERANGE);
     
+    s_check_xml_parse();
+    
     printf("s_num_fails=%i\n", s_num_fails);
     
     if (s_num_fails) {
diff --git a/extract/src/odt.c b/extract/src/odt.c
index bacb362d..9e369078 100644
--- a/extract/src/odt.c
+++ b/extract/src/odt.c
@@ -21,6 +21,7 @@ odt_paragraph_finish(). */
 
 #include <assert.h>
 #include <errno.h>
+#include <float.h>
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,17 +30,16 @@ odt_paragraph_finish(). */
 #include <sys/stat.h>
 
 
-static int extract_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_paragraph_start(extract_alloc_t* alloc, extract_astring_t* content)
 {
     return extract_astring_cat(alloc, content, "\n\n<text:p>");
 }
 
-static int extract_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_paragraph_finish(extract_alloc_t* alloc, extract_astring_t* content)
 {
     return extract_astring_cat(alloc, content, "</text:p>");
 }
 
-
 /* ODT doesn't seem to support ad-hoc inline font specifications; instead
 we have to define a style at the start of the content.xml file. So when
 writing content we insert a style name and add the required styles to a
@@ -48,10 +48,7 @@ extract_odt_styles_t struct. */
 struct extract_odt_style_t
 {
     int     id; /* A unique id for this style. */
-    char*   font_name;
-    double  font_size;
-    int     font_bold;
-    int     font_italic;
+    font_t  font;
 };
 
 struct extract_odt_styles_t
@@ -61,41 +58,47 @@ struct extract_odt_styles_t
     int                     styles_num;
 };
 
-static int extract_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b)
+static int s_odt_style_compare(extract_odt_style_t* a, extract_odt_style_t*b)
 {
     int d;
     double dd;
-    if ((d = strcmp(a->font_name, b->font_name)))   return d;
-    if ((dd = a->font_size - b->font_size) != 0.0)  return (dd > 0.0) ? 1 : -1;
-    if ((d = a->font_bold - b->font_bold))          return d;
-    if ((d = a->font_italic - b->font_italic))      return d;
+    if ((d = strcmp(a->font.name, b->font.name)))   return d;
+    if ((dd = a->font.size - b->font.size) != 0.0)  return (dd > 0.0) ? 1 : -1;
+    if ((d = a->font.bold - b->font.bold))          return d;
+    if ((d = a->font.italic - b->font.italic))      return d;
     return 0;
 }
 
-static int extract_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text)
+static int s_odt_style_append_definition(extract_alloc_t* alloc, extract_odt_style_t* style, extract_astring_t* text)
 {
-    const char* font_name = style->font_name;
+    const char* font_name = style->font.name;
     /* This improves output e.g. for zlib.3.pdf, but clearly a hack. */
     if (0 && strstr(font_name, "Helvetica"))
     {
         font_name = "Liberation Sans";
     }
-    outf("style->font_name=%s font_name=%s", style->font_name, font_name);
+    outf("style->font_name=%s font_name=%s", style->font.name, font_name);
     if (extract_astring_catf(alloc, text, "<style:style style:name=\"T%i\" style:family=\"text\">", style->id)) return -1;
     if (extract_astring_catf(alloc, text, "<style:text-properties style:font-name=\"%s\"", font_name)) return -1;
-    if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font_size)) return -1;
-    if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font_bold ? "bold" : "normal")) return -1;
-    if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font_italic ? "italic" : "normal")) return -1;
+    if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font.size)) return -1;
+    if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font.bold ? "bold" : "normal")) return -1;
+    if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font.italic ? "italic" : "normal")) return -1;
     if (extract_astring_cat(alloc, text, " /></style:style>")) return -1;
     return 0;
 }
 
 void extract_odt_styles_free(extract_alloc_t* alloc, extract_odt_styles_t* styles)
 {
+    int i;
+    for (i=0; i<styles->styles_num; ++i)
+    {
+        extract_odt_style_t* style = &styles->styles[i];
+        extract_free(alloc, &style->font.name);  /* Each name was extract_strdup()'d by s_odt_styles_add(). */
+    }
     extract_free(alloc, &styles->styles);
 }
 
-static int extract_odt_styles_definitions(
+static int s_odt_styles_definitions(
         extract_alloc_t*        alloc,
         extract_odt_styles_t*   styles,
         extract_astring_t*      out
@@ -105,7 +108,7 @@ static int extract_odt_styles_definitions(
     if (extract_astring_cat(alloc, out, "<office:automatic-styles>")) return -1;
     for (i=0; i<styles->styles_num; ++i)
     {
-        if (extract_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1;
+        if (s_odt_style_append_definition(alloc, &styles->styles[i], out)) return -1;
     }
     extract_astring_cat(alloc, out, "<style:style style:name=\"gr1\" style:family=\"graphic\">\n");
     extract_astring_cat(alloc, out, "<style:graphic-properties"
@@ -159,25 +162,22 @@ static int extract_odt_styles_definitions(
     return 0;
 }
 
-static int styles_add(
+static int s_odt_styles_add(
         extract_alloc_t*        alloc,
         extract_odt_styles_t*   styles,
-        const char*             font_name,
-        double                  font_size,
-        int                     font_bold,
-        int                     font_italic,
+        font_t*                 font,
         extract_odt_style_t**   o_style
     )
 /* Adds specified style to <styles> if not already present. Sets *o_style to
 point to the style_t within <styles>. */
 {
-    extract_odt_style_t style = {0 /*id*/, (char*) font_name, font_size, font_bold, font_italic};
+    extract_odt_style_t style = {0 /*id*/, *font};
     int i;
     /* We keep styles->styles[] sorted; todo: use bsearch or similar when
     searching. */
     for (i=0; i<styles->styles_num; ++i)
     {
-        int d = extract_odt_style_compare(&style, &styles->styles[i]);
+        int d = s_odt_style_compare(&style, &styles->styles[i]);
         if (d == 0)
         {
             *o_style = &styles->styles[i];
@@ -190,92 +190,79 @@ point to the style_t within <styles>. */
     memmove(&styles->styles[i+1], &styles->styles[i], sizeof(styles->styles[0]) * (styles->styles_num - i));
     styles->styles_num += 1;
     styles->styles[i].id = styles->styles_num + 10; /* Leave space for template's built-in styles. */
-    if (extract_strdup(alloc, font_name, &styles->styles[i].font_name)) return -1;
-    styles->styles[i].font_size = font_size;
-    styles->styles[i].font_bold = font_bold;
-    styles->styles[i].font_italic = font_italic;
+    if (extract_strdup(alloc, font->name, &styles->styles[i].font.name)) return -1;
+    styles->styles[i].font.size = font->size;
+    styles->styles[i].font.bold = font->bold;
+    styles->styles[i].font.italic = font->italic;
     *o_style = &styles->styles[i];
     return 0;
 }
 
 static int extract_odt_run_start(
-        extract_alloc_t* alloc,
-        extract_astring_t* content,
-        extract_odt_styles_t* styles,
-        const char* font_name,
-        double font_size,
-        int bold,
-        int italic
+        extract_alloc_t*        alloc,
+        extract_astring_t*      content,
+        extract_odt_styles_t*   styles,
+        content_state_t*        content_state
         )
-/* Starts a new run. Caller must ensure that extract_odt_run_finish() was
+/* Starts a new run. Caller must ensure that s_odt_run_finish() was
 called to terminate any previous run. */
 {
     extract_odt_style_t* style;
-    if (styles_add(alloc, styles, font_name, font_size, bold, italic, &style)) return -1;
+    if (s_odt_styles_add(
+            alloc,
+            styles,
+            &content_state->font,
+            &style
+            )) return -1;
     if (extract_astring_catf(alloc, content, "<text:span text:style-name=\"T%i\">", style->id)) return -1;
     return 0;
 }
 
-static int extract_odt_run_finish(extract_alloc_t* alloc, extract_astring_t* content)
+static int s_odt_run_finish(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content)
 {
+    if (content_state)  content_state->font.name = NULL;    /* NULL name is how s_document_to_odt_content_paragraph() sees that no run is open. */
     return extract_astring_cat(alloc, content, "</text:span>");
 }
 
-static int extract_odt_paragraph_empty(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles)
+static int s_odt_append_empty_paragraph(extract_alloc_t* alloc, extract_astring_t* content, extract_odt_styles_t* styles)
 /* Append an empty paragraph to *content. */
 {
     int e = -1;
-    if (extract_odt_paragraph_start(alloc, content)) goto end;
+    static char fontname[] = "OpenSans";    /* Writable array rather than a literal; presumably because font_t.name is a non-const char* - confirm. */
+    content_state_t content_state = {0};
+    if (s_odt_paragraph_start(alloc, content)) goto end;
     /* [This comment is from docx, haven't checked odt.] It seems like our
-    choice of font size here doesn't make any difference to the ammount of
+    choice of font size here doesn't make any difference to the amount of
     vertical space, unless we include a non-space character. Presumably
     something to do with the styles in the template document. */
-    if (extract_odt_run_start(
-            alloc,
-            content,
-            styles,
-            "OpenSans",
-            10 /*font_size*/,
-            0 /*font_bold*/,
-            0 /*font_italic*/
-            )) goto end;
+    content_state.font.name = fontname;
+    content_state.font.size = 10;
+    content_state.font.bold = 0;
+    content_state.font.italic = 0;
+    if (extract_odt_run_start(alloc, content, styles, &content_state)) goto end;
     //docx_char_append_string(content, "&#160;");   /* &#160; is non-break space. */
-    if (extract_odt_run_finish(alloc, content)) goto end;
-    if (extract_odt_paragraph_finish(alloc, content)) goto end;
+    if (s_odt_run_finish(alloc, NULL /*content_state*/, content)) goto end;  /* NULL: the local content_state is discarded, so no reset is needed. */
+    if (s_odt_paragraph_finish(alloc, content)) goto end;
     e = 0;
     end:
     return e;
 }
 
 
-typedef struct
-{
-    const char* font_name;
-    double      font_size;
-    int         font_bold;
-    int         font_italic;
-    matrix_t*   ctm_prev;
-    /* todo: add extract_odt_styles_t member? */
-} content_state_t;
-/* Used to keep track of font information when writing paragraphs of odt
-content, e.g. so we know whether a font has changed so need to start a new odt
-span. */
-
-
-static int extract_document_to_odt_content_paragraph(
+static int s_document_to_odt_content_paragraph(
         extract_alloc_t*        alloc,
-        content_state_t*        state,
+        content_state_t*        content_state,
         paragraph_t*            paragraph,
         extract_astring_t*      content,
         extract_odt_styles_t*   styles
         )
-/* Append odt xml for <paragraph> to <content>. Updates *state if we change
-font. */
+/* Append odt xml for <paragraph> to <content>. Updates *content_state if we
+change font. */
 {
     int e = -1;
     int l;
 
-    if (extract_odt_paragraph_start(alloc, content)) goto end;
+    if (s_odt_paragraph_start(alloc, content)) goto end;
 
     for (l=0; l<paragraph->lines_num; ++l)
     {
@@ -286,50 +273,41 @@ font. */
             int si;
             span_t* span = line->spans[s];
             double font_size_new;
-            state->ctm_prev = &span->ctm;
+            content_state->ctm_prev = &span->ctm;
             font_size_new = extract_matrices_to_font_size(&span->ctm, &span->trm);
-            if (!state->font_name
-                    || strcmp(span->font_name, state->font_name)
-                    || span->flags.font_bold != state->font_bold
-                    || span->flags.font_italic != state->font_italic
-                    || font_size_new != state->font_size
+            if (!content_state->font.name
+                    || strcmp(span->font_name, content_state->font.name)
+                    || span->flags.font_bold != content_state->font.bold
+                    || span->flags.font_italic != content_state->font.italic
+                    || font_size_new != content_state->font.size
                     )
             {
-                if (state->font_name)
+                if (content_state->font.name)
                 {
-                    if (extract_odt_run_finish(alloc, content)) goto end;
+                    if (s_odt_run_finish(alloc, content_state, content)) goto end;
                 }
-                state->font_name = span->font_name;
-                state->font_bold = span->flags.font_bold;
-                state->font_italic = span->flags.font_italic;
-                state->font_size = font_size_new;
-                if (extract_odt_run_start(
-                        alloc,
-                        content,
-                        styles,
-                        state->font_name,
-                        state->font_size,
-                        state->font_bold,
-                        state->font_italic
-                        )) goto end;
+                content_state->font.name = span->font_name;
+                content_state->font.bold = span->flags.font_bold;
+                content_state->font.italic = span->flags.font_italic;
+                content_state->font.size = font_size_new;
+                if (extract_odt_run_start( alloc, content, styles, content_state)) goto end;
             }
 
             for (si=0; si<span->chars_num; ++si)
             {
                 char_t* char_ = &span->chars[si];
                 int c = char_->ucs;
-                if (extract_astring_cat_xmlc(alloc, content, c)) goto end;
+                if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end;
             }
             /* Remove any trailing '-' at end of line. */
-            if (astring_char_truncate_if(content, '-')) goto end;
+            if (extract_astring_char_truncate_if(content, '-')) goto end;
         }
     }
-    if (state->font_name)
+    if (content_state->font.name)
     {
-        if (extract_odt_run_finish(alloc, content)) goto end;
-        state->font_name = NULL;
+        if (s_odt_run_finish(alloc, content_state, content)) goto end;
     }
-    if (extract_odt_paragraph_finish(alloc, content)) goto end;
+    if (s_odt_paragraph_finish(alloc, content)) goto end;
     
     e = 0;
     
@@ -337,7 +315,7 @@ font. */
     return e;
 }
 
-static int extract_document_append_image(
+static int s_odt_append_image(
         extract_alloc_t*    alloc,
         extract_astring_t*  content,
         image_t*            image
@@ -362,7 +340,7 @@ static int extract_document_append_image(
 }
 
 
-static int extract_document_output_rotated_paragraphs(
+static int s_odt_output_rotated_paragraphs(
         extract_alloc_t*    alloc,
         extract_page_t*     page,
         int                 paragraph_begin,
@@ -375,14 +353,14 @@ static int extract_document_output_rotated_paragraphs(
         int                 text_box_id,
         extract_astring_t*  content,
         extract_odt_styles_t* styles,
-        content_state_t*    state
+        content_state_t*    content_state
         )
 /* Writes paragraph to content inside rotated text box. */
 {
     int e = 0;
     int p;
     double pt_to_inch = 1/72.0;
-    outf("rotated paragraphs: rotation_rad=%f (x y)=(%i %i) (w h)=(%i %i)", rotation_rad, x_pt, y_pt, w_pt, h_pt);
+    outf("rotated paragraphs: rotation_rad=%f (x y)=(%f %f) (w h)=(%f %f)", rotation_rad, x_pt, y_pt, w_pt, h_pt);
     
     // https://docs.oasis-open.org/office/OpenDocument/v1.3/cs02/part3-schema/OpenDocument-v1.3-cs02-part3-schema.html#attribute-draw_transform
     // says rotation is in degrees, but we seem to require -radians.
@@ -414,7 +392,7 @@ static int extract_document_output_rotated_paragraphs(
     for (p=paragraph_begin; p<paragraph_end; ++p)
     {
         paragraph_t* paragraph = page->paragraphs[p];
-        if (!e) e = extract_document_to_odt_content_paragraph(alloc, state, paragraph, content, styles);
+        if (!e) e = s_document_to_odt_content_paragraph(alloc, content_state, paragraph, content, styles);
     }
     
     if (!e) e = extract_astring_cat(alloc, content, "\n");
@@ -427,6 +405,219 @@ static int extract_document_output_rotated_paragraphs(
 }
 
 
+static int s_odt_append_table(extract_alloc_t* alloc, table_t* table, extract_astring_t* content, extract_odt_styles_t* styles)
+/* Appends <table> to <content> as an ODT <table:table> element: one
+<table:table-column/> per grid column, then one <table:table-row> per grid
+row. A cell whose <above> or <left> flag is clear is written as
+<table:covered-table-cell/>. Returns 0 on success, or -1 if any append fails. */
+{
+    int e = -1;
+    int y;
+
+    {
+        int x;
+        /* Numbers tables across all calls so each table:name is distinct. */
+        static int table_number = 0;
+        table_number += 1;
+        if (extract_astring_catf(alloc, content,
+                "\n"
+                "    <table:table text:style-name=\"extract.table\" table:name=\"extract.table.%i\">\n"
+                "        <table:table-columns>\n"
+                ,
+                table_number
+                )) goto end;
+
+        for (x=0; x<table->cells_num_x; ++x)
+        {
+            if (extract_astring_cat(alloc, content, "            <table:table-column table:style-name=\"extract.table.column\"/>\n")) goto end;
+        }
+        if (extract_astring_cat(alloc, content, "        </table:table-columns>\n")) goto end;
+    }
+    for (y=0; y<table->cells_num_y; ++y)
+    {
+        int x;
+        if (extract_astring_cat(alloc, content, "        <table:table-row>\n")) goto end;
+
+        for (x=0; x<table->cells_num_x; ++x)
+        {
+            cell_t* cell = table->cells[y*table->cells_num_x + x];
+            if (!cell->above || !cell->left)
+            {
+                if (extract_astring_cat(alloc, content, "            <table:covered-table-cell/>\n")) goto end;
+                continue;
+            }
+
+            if (extract_astring_cat(alloc, content, "            <table:table-cell")) goto end;
+            if (cell->extend_right > 1)
+            {
+                if (extract_astring_catf(alloc, content, " table:number-columns-spanned=\"%i\"", cell->extend_right)) goto end;
+            }
+            if (cell->extend_down > 1)
+            {
+                if (extract_astring_catf(alloc, content, " table:number-rows-spanned=\"%i\"", cell->extend_down)) goto end;
+            }
+            if (extract_astring_catf(alloc, content, ">\n")) goto end;
+
+            /* Write contents of this cell. */
+            {
+                int p;
+                /* Zero the whole state (font.name and ctm_prev become NULL)
+                so that no member is left indeterminate for the helpers. */
+                content_state_t content_state = {0};
+                for (p=0; p<cell->paragraphs_num; ++p)
+                {
+                    paragraph_t* paragraph = cell->paragraphs[p];
+                    if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end;
+                }
+                if (content_state.font.name)
+                {
+                    /* font.name still set: finish the run (presumably still open). */
+                    if (s_odt_run_finish(alloc, &content_state, content)) goto end;
+                }
+                if (extract_astring_cat(alloc, content, "\n")) goto end;
+            }
+            if (extract_astring_cat(alloc, content, "            </table:table-cell>\n")) goto end;
+        }
+        if (extract_astring_cat(alloc, content, "        </table:table-row>\n")) goto end;
+    }
+    if (extract_astring_cat(alloc, content, "    </table:table>\n")) goto end;
+    e = 0;
+
+    end:
+    return e;
+}
+
+
+static int s_odt_append_rotated_paragraphs(
+        extract_alloc_t*    alloc,
+        extract_page_t*     page,
+        content_state_t*    content_state,
+        int*                p,
+        int*                text_box_id,
+        const matrix_t*     ctm,
+        double              rotate,
+        extract_astring_t*  content,
+        extract_odt_styles_t* styles
+        )
+/* Appends the run of paragraphs sharing page->paragraphs[*p]'s rotation as one
+rotated text box; on success *p is the index of the last paragraph written. */
+{
+    /* Find extent of paragraphs with this same rotation. extent
+    will contain max width and max height of paragraphs, in units
+    before application of ctm, i.e. before rotation. */
+    int e = -1;
+    point_t extent = {0, 0};
+    int p0 = *p;
+    int p1;
+    paragraph_t* paragraph = page->paragraphs[*p];
+
+    outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
+            rotate, rotate * 180 / pi,
+            ctm->e,
+            ctm->f,
+            ctm->a,
+            ctm->b,
+            ctm->c,
+            ctm->d
+            );
+
+    {
+        /* We assume that first span is at origin of text
+        block. This assumes left-to-right text. */
+        double rotate0 = rotate;
+        const matrix_t* ctm0 = ctm;
+        point_t origin =
+        {
+                paragraph->lines[0]->spans[0]->chars[0].x,
+                paragraph->lines[0]->spans[0]->chars[0].y
+        };
+        matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
+        double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
+        if (ctm_det != 0)
+        {
+            ctm_inverse.a = +ctm->d / ctm_det;
+            ctm_inverse.b = -ctm->b / ctm_det;
+            ctm_inverse.c = -ctm->c / ctm_det;
+            ctm_inverse.d = +ctm->a / ctm_det;
+        }
+        else
+        {
+            outf("cannot invert ctm=(%f %f %f %f)",
+                    ctm->a, ctm->b, ctm->c, ctm->d);
+        }
+
+        for (*p=p0; *p<page->paragraphs_num; ++*p)
+        {
+            paragraph = page->paragraphs[*p];
+            ctm = &paragraph->lines[0]->spans[0]->ctm;
+            rotate = atan2(ctm->b, ctm->a);
+            if (rotate != rotate0)
+            {
+                break;
+            }
+
+            /* Update <extent>. */
+            {
+                int l;
+                for (l=0; l<paragraph->lines_num; ++l)
+                {
+                    line_t* line = paragraph->lines[l];
+                    span_t* span = extract_line_span_last(line);
+                    char_t* char_ = extract_span_char_last(span);
+                    double adv = char_->adv * extract_matrix_expansion(span->trm);
+                    double x = char_->x + adv * cos(rotate);
+                    double y = char_->y + adv * sin(rotate);
+
+                    double dx = x - origin.x;
+                    double dy = y - origin.y;
+
+                    /* Position relative to origin and before box rotation. */
+                    double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
+                    double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
+                    yy = -yy;
+                    if (xx > extent.x) extent.x = xx;
+                    if (yy > extent.y) extent.y = yy;
+                    if (0) outf("rotate=%f *p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
+                            rotate, *p, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span));
+                }
+            }
+        }
+        p1 = *p;
+        rotate = rotate0;
+        ctm = ctm0;
+        outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
+                rotate, p0, p1, extent.x, extent.y);
+    }
+
+    /* Paragraphs p0..p1-1 have same rotation. We output them into
+    a single rotated text box. */
+
+    /* We need unique id for text box. */
+    *text_box_id += 1;
+
+    if (s_odt_output_rotated_paragraphs(
+            alloc,
+            page,
+            p0,
+            p1,
+            rotate,
+            ctm->e,
+            ctm->f,
+            extent.x,
+            extent.y,
+            *text_box_id,
+            content,
+            styles,
+            content_state
+            )) goto end;
+    *p = p1 - 1;
+    e = 0;
+    
+    end:
+    return e;
+}
+
+
 int extract_document_to_odt_content(
         extract_alloc_t*    alloc,
         document_t*         document,
@@ -445,156 +636,66 @@ int extract_document_to_odt_content(
     for (p=0; p<document->pages_num; ++p)
     {
         extract_page_t* page = document->pages[p];
-        int p;
-        content_state_t state;
-        state.font_name = NULL;
-        state.font_size = 0;
-        state.font_bold = 0;
-        state.font_italic = 0;
-        state.ctm_prev = NULL;
+        int p = 0;
+        int t = 0;
+        content_state_t content_state;
+        content_state.font.name = NULL;
+        content_state.font.size = 0;
+        content_state.font.bold = 0;
+        content_state.font.italic = 0;
+        content_state.ctm_prev = NULL;
         
-        for (p=0; p<page->paragraphs_num; ++p)
+        for(;;)
         {
-            paragraph_t* paragraph = page->paragraphs[p];
-            const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
-            double rotate = atan2(ctm->b, ctm->a);
+            paragraph_t* paragraph = (p == page->paragraphs_num) ? NULL : page->paragraphs[p];
+            table_t* table = (t == page->tables_num) ? NULL : page->tables[t];
+            double y_paragraph;
+            double y_table;
+            if (!paragraph && !table)   break;
+            y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
+            y_table = (table) ? table->pos.y : DBL_MAX;
             
-            if (spacing
-                    && state.ctm_prev
-                    && paragraph->lines_num
-                    && paragraph->lines[0]->spans_num
-                    && matrix_cmp4(
-                            state.ctm_prev,
-                            &paragraph->lines[0]->spans[0]->ctm
-                            )
-                    )
+            if (paragraph && (!table || y_paragraph < y_table)) /* With no table left, always emit the paragraph so the loop advances. */
             {
-                /* Extra vertical space between paragraphs that were at
-                different angles in the original document. */
-                if (extract_odt_paragraph_empty(alloc, content, styles)) goto end;
-            }
+                const matrix_t* ctm = &paragraph->lines[0]->spans[0]->ctm;
+                double rotate = atan2(ctm->b, ctm->a);
+
+                if (spacing
+                        && content_state.ctm_prev
+                        && paragraph->lines_num
+                        && paragraph->lines[0]->spans_num
+                        && extract_matrix_cmp4(
+                                content_state.ctm_prev,
+                                &paragraph->lines[0]->spans[0]->ctm
+                                )
+                        )
+                {
+                    /* Extra vertical space between paragraphs that were at
+                    different angles in the original document. */
+                    if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end;
+                }
 
-            if (spacing)
-            {
-                /* Extra vertical space between paragraphs. */
-                if (extract_odt_paragraph_empty(alloc, content, styles)) goto end;
-            }
-            
-            if (rotation && rotate != 0)
-            {
-                /* Find extent of paragraphs with this same rotation. extent
-                will contain max width and max height of paragraphs, in units
-                before application of ctm, i.e. before rotation. */
-                point_t extent = {0, 0};
-                int p0 = p;
-                int p1;
-                
-                outf("rotate=%.2frad=%.1fdeg ctm: ef=(%f %f) abcd=(%f %f %f %f)",
-                        rotate, rotate * 180 / pi,
-                        ctm->e,
-                        ctm->f,
-                        ctm->a,
-                        ctm->b,
-                        ctm->c,
-                        ctm->d
-                        );
-                
+                if (spacing)
                 {
-                    /* We assume that first span is at origin of text
-                    block. This assumes left-to-right text. */
-                    double rotate0 = rotate;
-                    const matrix_t* ctm0 = ctm;
-                    point_t origin =
-                    {
-                            paragraph->lines[0]->spans[0]->chars[0].x,
-                            paragraph->lines[0]->spans[0]->chars[0].y
-                    };
-                    matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
-                    double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
-                    if (ctm_det != 0)
-                    {
-                        ctm_inverse.a = +ctm->d / ctm_det;
-                        ctm_inverse.b = -ctm->b / ctm_det;
-                        ctm_inverse.c = -ctm->c / ctm_det;
-                        ctm_inverse.d = +ctm->a / ctm_det;
-                    }
-                    else
-                    {
-                        outf("cannot invert ctm=(%f %f %f %f)",
-                                ctm->a, ctm->b, ctm->c, ctm->d);
-                    }
-
-                    for (p=p0; p<page->paragraphs_num; ++p)
-                    {
-                        paragraph = page->paragraphs[p];
-                        ctm = &paragraph->lines[0]->spans[0]->ctm;
-                        rotate = atan2(ctm->b, ctm->a);
-                        if (rotate != rotate0)
-                        {
-                            break;
-                        }
-
-                        /* Update <extent>. */
-                        {
-                            int l;
-                            for (l=0; l<paragraph->lines_num; ++l)
-                            {
-                                line_t* line = paragraph->lines[l];
-                                span_t* span = line_span_last(line);
-                                char_t* char_ = span_char_last(span);
-                                double adv = char_->adv * matrix_expansion(span->trm);
-                                double x = char_->x + adv * cos(rotate);
-                                double y = char_->y + adv * sin(rotate);
-
-                                double dx = x - origin.x;
-                                double dy = y - origin.y;
-
-                                /* Position relative to origin and before box rotation. */
-                                double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
-                                double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
-                                yy = -yy;
-                                if (xx > extent.x) extent.x = xx;
-                                if (yy > extent.y) extent.y = yy;
-                                if (0) outf("rotate=%f p=%i: origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
-                                        rotate, p, origin.x, origin.y, x, y, dx, dy, xx, yy, span_string(alloc, span));
-                            }
-                        }
-                    }
-                    p1 = p;
-                    rotate = rotate0;
-                    ctm = ctm0;
-                    outf("rotate=%f p0=%i p1=%i. extent is: (%f %f)",
-                            rotate, p0, p1, extent.x, extent.y);
+                    /* Extra vertical space between paragraphs. */
+                    if (s_odt_append_empty_paragraph(alloc, content, styles)) goto end;
                 }
-                
-                /* Paragraphs p0..p1-1 have same rotation. We output them into
-                a single rotated text box. */
-                
-                /* We need unique id for text box. */
-                text_box_id += 1;
-                
-                if (extract_document_output_rotated_paragraphs(
-                        alloc,
-                        page,
-                        p0,
-                        p1,
-                        rotate,
-                        ctm->e,
-                        ctm->f,
-                        extent.x,
-                        extent.y,
-                        text_box_id,
-                        content,
-                        styles,
-                        &state
-                        )) goto end;
-                p = p1 - 1;
+
+                if (rotation && rotate != 0)
+                {
+                    if (s_odt_append_rotated_paragraphs(alloc, page, &content_state, &p, &text_box_id, ctm, rotate, content, styles)) goto end;
+                }
+                else
+                {
+                    if (s_document_to_odt_content_paragraph(alloc, &content_state, paragraph, content, styles)) goto end;
+                }
+                p += 1;
             }
-            else
+            else if (table)
             {
-                if (extract_document_to_odt_content_paragraph(alloc, &state, paragraph, content, styles)) goto end;
+                if (s_odt_append_table(alloc, table, content, styles)) goto end;
+                t += 1;
             }
-        
         }
         
         outf("images=%i", images);
@@ -604,7 +705,7 @@ int extract_document_to_odt_content(
             outf("page->images_num=%i", page->images_num);
             for (i=0; i<page->images_num; ++i)
             {
-                extract_document_append_image(alloc, content, &page->images[i]);
+                s_odt_append_image(alloc, content, &page->images[i]);
             }
         }
     }
@@ -658,26 +759,39 @@ int extract_odt_content_item(
         char* text_intermediate = NULL;
         extract_astring_t   styles_definitions = {0};
 
+        /* Insert content before '</office:text>'. */
         if (extract_content_insert(
                 alloc,
                 text,
                 NULL /*single*/,
-                NULL,
-                "</office:text>",
+                NULL /*mid_begin_name*/,
+                "</office:text>" /*mid_end_name*/,
                 contentss,
                 contentss_num,
                 &text_intermediate
                 )) goto end;
         outf("text_intermediate: %s", text_intermediate);
         
-        if (extract_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end;
+        /* Convert <styles> to text. */
+        if (s_odt_styles_definitions(alloc, styles, &styles_definitions)) goto end;
         
+        /* To make tables work, we seem to need to specify table and column
+        styles, and these can be empty. todo: maybe specify exact sizes based
+        on the pdf table and cell dimensions. */
+        if (extract_astring_cat(alloc, &styles_definitions,
+                "\n"
+                "<style:style style:name=\"extract.table\" style:family=\"table\"/>\n"
+                "<style:style style:name=\"extract.table.column\" style:family=\"table-column\"/>\n"
+                )) goto end;
+        
+        /* Replace '<office:automatic-styles/>' with text from
+        <styles_definitions>. */
         e = extract_content_insert(
                 alloc,
                 text_intermediate,
                 "<office:automatic-styles/>" /*single*/,
-                NULL,
-                NULL, //"</office:automatic-styles>",
+                NULL /*mid_begin_name*/,
+                NULL /*mid_end_name*/,
                 &styles_definitions,
                 1,
                 text2
@@ -719,14 +833,14 @@ int extract_odt_content_item(
     }
     e = 0;
     end:
-    outf("e=%i errno=%i text2=%s", e, errno, text2);
+    outf("e=%i errno=%i text2=%s", e, errno, text2 ? *text2 : "");
     if (e)
     {
         /* We might have set <text2> to new content. */
         extract_free(alloc, text2);
         /* We might have used <temp> as a temporary buffer. */
-        extract_astring_free(alloc, &temp);
     }
+    extract_astring_free(alloc, &temp);
     extract_astring_init(&temp);
     return e;
 }
@@ -747,7 +861,6 @@ int extract_odt_write_template(
     int     e = -1;
     int     i;
     char*   path_tempdir = NULL;
-    FILE*   f = NULL;
     char*   path = NULL;
     char*   text = NULL;
     char*   text2 = NULL;
@@ -827,7 +940,6 @@ int extract_odt_write_template(
     }
 
     /* Copy images into <path_tempdir>/Pictures/. */
-    outf("");
     extract_free(alloc, &path);
     if (extract_asprintf(alloc, &path, "%s/Pictures", path_tempdir) < 0) goto end;
     if (extract_mkdir(path, 0777))
@@ -835,7 +947,6 @@ int extract_odt_write_template(
         outf("Failed to mkdir %s", path);
         goto end;
     }
-    outf("");
     for (i=0; i<images->images_num; ++i)
     {
         image_t* image = &images->images[i];
@@ -869,8 +980,6 @@ int extract_odt_write_template(
     extract_free(alloc, &path);
     extract_free(alloc, &text);
     extract_free(alloc, &text2);
-    //extract_odt_styles_free(alloc, &styles);
-    if (f)  fclose(f);
 
     if (e)
     {
diff --git a/extract/src/outf.c b/extract/src/outf.c
index 95575c16..de7662f6 100644
--- a/extract/src/outf.c
+++ b/extract/src/outf.c
@@ -5,14 +5,14 @@
 #include <stdio.h>
 #include <string.h>
 
-static int s_verbose = 0;
+int extract_outf_verbose = 0;
 
-void outf_verbose_set(int verbose)
+void extract_outf_verbose_set(int verbose)
 {
-    s_verbose = verbose;
+    extract_outf_verbose = verbose;
 }
 
-void (outf)(
+void (extract_outf)(
         int         level,
         const char* file,
         int         line,
@@ -23,7 +23,7 @@ void (outf)(
         )
 {
     va_list va;
-    if (level > s_verbose) {
+    if (level > extract_outf_verbose) {
         return;
     }
     
diff --git a/extract/src/outf.h b/extract/src/outf.h
index a2b6c078..f9b97a93 100644
--- a/extract/src/outf.h
+++ b/extract/src/outf.h
@@ -1,32 +1,42 @@
 #ifndef ARTIFEX_EXTRACT_OUTF_H
 #define ARTIFEX_EXTRACT_OUTF_H
 
+/* Simple printf-style debug output. */
+
+#if defined(__GNUC__) || defined(__clang__) || defined(_WIN32)
+    #define extract_FUNCTION __FUNCTION__
+#else
+    #define extract_FUNCTION ""
+#endif
+
+#define outf(format, ...) \
+        ((1 > extract_outf_verbose) ? (void) 0 : (extract_outf)(1, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__))
+/* As outf() but at level 0; both skip evaluating their arguments when suppressed. */
+#define outf0(format, ...) \
+        ((0 > extract_outf_verbose) ? (void) 0 : (extract_outf)(0, __FILE__, __LINE__, extract_FUNCTION, 1 /*ln*/, format, ##__VA_ARGS__))
+
+#define outfx(format, ...)
+
 /* Only for internal use by extract code.  */
 
-void (outf)(
+extern int extract_outf_verbose;
+
+void (extract_outf)(
         int level,
         const char* file, int line,
         const char* fn,
         int ln,
         const char* format,
         ...
-        );
+        )
+        #ifdef __GNUC__
+        __attribute__ ((format (printf, 6, 7)))
+        #endif
+        ;
 /* Outputs text if <level> is less than or equal to verbose value set by
-outf_level_set(). */
+extract_outf_verbose_set(). */
 
-#define outf(format, ...) \
-        (outf)(1, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__)
-
-#define outf0(format, ...) \
-        (outf)(0, __FILE__, __LINE__, __FUNCTION__, 1 /*ln*/, format, ##__VA_ARGS__)
-
-#define outfx(format, ...)
-
-/* Simple printf-style debug output. */
-
-#define outfx(format, ...)
-
-void outf_verbose_set(int verbose);
+void extract_outf_verbose_set(int verbose);
 /* Set verbose value. Higher values are more verbose. Initial value is 0. */
 
 #endif
diff --git a/extract/src/sys.c b/extract/src/sys.c
index 131f6312..2359acab 100644
--- a/extract/src/sys.c
+++ b/extract/src/sys.c
@@ -82,7 +82,7 @@ int  extract_read_all_path(extract_alloc_t* alloc, const char* path, char** o_te
     e = 0;
     end:
     if (f) fclose(f);
-    if (e) extract_free(alloc, &o_text);
+    if (e) extract_free(alloc, o_text);
     return e;
 }
 
diff --git a/extract/src/text.c b/extract/src/text.c
index f832baa2..e75e3e69 100644
--- a/extract/src/text.c
+++ b/extract/src/text.c
@@ -18,23 +18,6 @@ int extract_content_insert(
         int                 contentss_num,
         char**              o_out
         )
-/* Creates a new string by inserting sequence of strings into a template
-string.
-
-If <single_name> is in <original>, it is replaced by <contentss>.
-
-Otherwise the text between the end of <mid_begin_name> and beginning of
-<mid_end_name> is replaced by <contentss>.
-
-If <mid_begin_name> is NULL, we insert into the zero-length region before
-<mid_end_name>.
-
-If <mid_end_name> is NULL, we insert into the zero-length region after
-<mid_begin_name>.
-
-At least one of <single_name>, <mid_begin_name> and <mid_end_name> must be
-non-NULL.
-*/
 {
     int e = -1;
     const char* mid_begin = NULL;
@@ -92,6 +75,11 @@ non-NULL.
             if (extract_astring_catl(alloc, &out, contentss[i].chars, contentss[i].chars_num)) goto end;
         }
     }
+    assert( mid_end);
+    /* As per docs, at least one of <single_name>, <mid_begin_name> and
+    <mid_end_name> is non-null, and this ensures that mid_end must not be null.
+    */
+    /* coverity[var_deref_model] */
     if (extract_astring_cat(alloc, &out, mid_end)) goto end;
     
     *o_out = out.chars;
diff --git a/extract/src/xml.c b/extract/src/xml.c
index 8dab511b..24116f6d 100644
--- a/extract/src/xml.c
+++ b/extract/src/xml.c
@@ -349,7 +349,7 @@ int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, co
         }
         first_line_buffer[actual] = 0;
         if (strcmp(first_line, first_line_buffer)) {
-            outf("Unrecognised prefix: ", first_line_buffer);
+            outf("Unrecognised prefix: %s", first_line_buffer);
             errno = ESRCH;
             goto end;
         }
@@ -393,7 +393,10 @@ static const char* extract_xml_tag_string(extract_alloc_t* alloc, extract_xml_ta
 {
     static char* buffer = NULL;
     extract_free(alloc, &buffer);
-    extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : "");
+    if (extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : "") < 0)
+    {
+        return "";  /* Formatting failed (negative return); fall back to an empty string. */
+    }
     return buffer;
 }
 
@@ -410,7 +413,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
     assert(buffer);
     extract_xml_tag_free(alloc, out);
 
-    /* Read tag name. */
+    /* Read tag name. Initialise it to empty string so we never return
+    out->name==null on success. */
+    if (str_catl( alloc, &out->name, NULL, 0)) goto end;
     for( i=0;; ++i) {
         int e = extract_buffer_read(buffer, &c, 1, NULL);
         if (e) {
@@ -438,6 +443,7 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
                 int quote_single = 0;
                 int quote_double = 0;
                 size_t l;
+                if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
                 for(;;) {
                     if (s_next(buffer, &ret, &c)) goto end;
                     if (c == '\'')      quote_single = !quote_single;
@@ -469,6 +475,10 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out)
                 }
             }
 
+            /* Ensure name and value are not NULL. */
+            if (str_catl( alloc, &attribute_name, NULL, 0)) goto end;
+            if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
+
             if (extract_xml_tag_attributes_append(alloc, out, attribute_name, attribute_value)) goto end;
             attribute_name = NULL;
             attribute_value = NULL;
diff --git a/extract/src/xml.h b/extract/src/xml.h
index d11fd886..8bc4dae2 100644
--- a/extract/src/xml.h
+++ b/extract/src/xml.h
@@ -35,6 +35,9 @@ void extract_xml_tag_free(extract_alloc_t* alloc, extract_xml_tag_t* tag);
 int extract_xml_pparse_init(extract_alloc_t* alloc, extract_buffer_t* buffer, const char* first_line);
 /* extract_xml_pparse_*(): simple XML 'pull' parser.
 
+If <first_line> is not NULL, we require that <buffer> starts with the specified
+text. Usually one would include a final newline in <first_line>.
+
 extract_xml_pparse_init() merely consumes the initial '<'. Thereafter
 extract_xml_pparse_next() consumes the next '<' before returning the previous
 tag. */
@@ -53,6 +56,9 @@ int extract_xml_pparse_next(extract_buffer_t* buffer, extract_xml_tag_t* out);
 Returns 0 with *out containing next tag; or -1 with errno set if error; or +1
 with errno=ESRCH if EOF.
 
+If we return 0, we guarantee that out->name points to valid string and that
+each item in out->attributes has similarly valid name and value members.
+
 *out is initially passed to extract_xml_tag_free(), so *out must have been
-initialised, e.g. by by extract_xml_tag_init(). */
+initialised, e.g. by extract_xml_tag_init(). */
 
diff --git a/extract/src/zip.c b/extract/src/zip.c
index 03bfd024..691b743b 100644
--- a/extract/src/zip.c
+++ b/extract/src/zip.c
@@ -10,6 +10,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <limits.h>
+#include <time.h>
 
 #ifdef _MSC_VER
     #include "compat_stdint.h"
@@ -74,8 +75,38 @@ int extract_zip_open(extract_buffer_t* buffer, extract_zip_t** o_zip)
     
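-    /* We could maybe convert current date/time to the ms-dos format required
-    here, but using zeros doesn't seem to make a difference to Word etc. */
+    /* Record the current UTC date/time in the MS-DOS format used by zip
+    headers; zeros are used instead if the time cannot be converted. */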
-    zip->mtime = 0;
-    zip->mdate = 0;
+        
+    {
+        time_t t = time(NULL);
+        struct tm*  tm;
+        #ifdef _POSIX_SOURCE
+            struct tm   tm_local;
+            tm = gmtime_r(&t, &tm_local);
+        #else
+            tm = gmtime(&t);
+        #endif
+        if (tm)
+        {
+            /* mdate and mtime are in MS DOS format:
+                mtime:
+                    bits 0-4: seconds / 2.
+                    bits 5-10: minute (0-59).
+                    bits 11-15: hour (0-23).
+                mdate:
+                    bits 0-4: day of month (1-31).
+                    bits 5-8: month (1=jan, 2=feb, etc).
+                    bits 9-15: year - 1980.
+            */
+            zip->mtime = (uint16_t) ((tm->tm_hour << 11) | (tm->tm_min << 5) | (tm->tm_sec / 2));
+            zip->mdate = (uint16_t) (((1900 + tm->tm_year - 1980) << 9) | ((tm->tm_mon + 1) << 5) | tm->tm_mday);
+        }
+        else
+        {
+            outf0("*** gmtime_r() failed");
+            zip->mtime = 0;
+            zip->mdate = 0;
+        }
+    }
     
     /* These are all copied from command-line zip on unix. */
     zip->version_creator = (0x3 << 8) + 30; /* 0x3 is unix, 30 means 3.0. */
@@ -115,7 +146,9 @@ static int s_native_little_endinesss(void)
         /* Native big-endiness. */
         return 0;
     }
-    abort();
+    /* Would like to call abort() here, but that breaks on AIX/gcc. */
+    assert(0);
+    return 0;
 }
 
 
@@ -148,7 +181,7 @@ static int s_write_compressed(
 /* Uses zlib to write raw deflate compressed data to zip->buffer. */
 {
     int ze;
-    z_stream    zstream;
+    z_stream    zstream = {0};  /* Initialise to keep Coverity quiet. */
     if (zip->errno_)    return -1;
     if (zip->eof)       return +1;
     
@@ -313,7 +346,7 @@ int extract_zip_write_file(
     cd_file->name = NULL;
     
     cd_file->mtime = zip->mtime;
-    cd_file->mdate = zip->mtime;
+    cd_file->mdate = zip->mdate;
     cd_file->crc_sum = (int32_t) crc32(crc32(0, NULL, 0), data, (int) data_length);
     cd_file->size_uncompressed = (int) data_length;
     if (zip->compression_method == 0)