diff options
Diffstat (limited to 'extract/src/document.h')
-rw-r--r-- | extract/src/document.h | 93 |
1 files changed, 76 insertions, 17 deletions
diff --git a/extract/src/document.h b/extract/src/document.h index 2dc4f1ee..69c4232c 100644 --- a/extract/src/document.h +++ b/extract/src/document.h @@ -26,6 +26,17 @@ typedef struct point_t max; } rect_t; +extern const rect_t extract_rect_infinite; +extern const rect_t extract_rect_empty; + +rect_t extract_rect_intersect(rect_t a, rect_t b); + +rect_t extract_rect_union(rect_t a, rect_t b); + +int extract_rect_contains_rect(rect_t a, rect_t b); + +int extract_rect_valid(rect_t a); + const char* extract_rect_string(const rect_t* rect); typedef struct @@ -56,13 +67,15 @@ typedef struct /* (x,y) before transformation by ctm and trm. */ double pre_x; double pre_y; - + /* (x,y) after transformation by ctm and trm. */ double x; double y; - + unsigned ucs; double adv; + + rect_t bbox; } char_t; /* A single char in a span. */ @@ -72,15 +85,15 @@ typedef struct matrix_t ctm; matrix_t trm; char* font_name; - + /* font size is extract_matrix_cmp4(trm). */ - + struct { unsigned font_bold : 1; unsigned font_italic : 1; unsigned wmode : 1; } flags; - + char_t* chars; int chars_num; } span_t; @@ -138,10 +151,10 @@ typedef struct double h; void* data; size_t data_size; - + extract_image_data_free data_free; void* data_free_handle; - + } image_t; /* Information about an image. <type> is as passed to extract_add_image(); <name> and <id> are created to be unique identifiers for use in generated docx @@ -166,18 +179,18 @@ typedef struct typedef struct { rect_t rect; - + /* If left/above is true, this cell is not obscured by cell to its left/above. */ uint8_t left; uint8_t above; - + /* extend_right and extend_down are 1 for normal cells, 2 for cells which extend right/down to cover an additional column/row, 3 to cover two additional columns/rows etc. */ int extend_right; int extend_down; - + /* Contents of this cell. */ line_t** lines; int lines_num; @@ -192,7 +205,7 @@ void extract_cell_free(extract_alloc_t* alloc, cell_t** pcell); typedef struct { point_t pos; /* top-left. */ - + /* Array of cells_num_x*cells_num_y cells; cell (x, y) is: cells_num_x * y + x. */ @@ -202,11 +215,30 @@ typedef struct } table_t; +typedef enum +{ + SPLIT_NONE = 0, + SPLIT_HORIZONTAL, + SPLIT_VERTICAL +} split_type_t; + + +typedef struct split_t +{ + split_type_t type; + double weight; + int count; + struct split_t *split[1]; +} split_t; + + typedef struct { + rect_t mediabox; + span_t** spans; int spans_num; - + image_t* images; int images_num; @@ -219,16 +251,27 @@ typedef struct int paragraphs_num; /* These refer to items in .lines. Initially empty, then set by extract_join(). */ - + tablelines_t tablelines_horizontal; tablelines_t tablelines_vertical; - + table_t** tables; int tables_num; +} subpage_t; +/* A subpage. Contains different representations of the list of spans. */ + +typedef struct +{ + rect_t mediabox; + + subpage_t** subpages; + int subpages_num; + + split_t* split; } extract_page_t; -/* A page. Contains different representations of the list of spans. NB not -+called page_t because this clashes with a system type on hpux. */ +/* A page. Contains a list of subpages. NB not +called page_t because this clashes with a system type on hpux. */ typedef struct @@ -248,7 +291,7 @@ typedef struct } images_t; -int extract_document_join(extract_alloc_t* alloc, document_t* document); +int extract_document_join(extract_alloc_t* alloc, document_t* document, int layout_analysis); /* This does all the work of finding paragraphs and tables. */ double extract_matrices_to_font_size(matrix_t* ctm, matrix_t* trm); @@ -273,5 +316,21 @@ typedef struct content, e.g. so we know whether a font has changed so need to start a new odt span. */ +int extract_page_analyse(extract_alloc_t* alloc, extract_page_t* page); +/* Analyse page content for layouts. */ + +int extract_subpage_alloc(extract_alloc_t* extract, rect_t mediabox, extract_page_t* page, subpage_t** psubpage); +/* content_t constructor. */ + +void extract_subpage_free(extract_alloc_t* alloc, subpage_t** psubpage); +/* subpage_t destructor. */ + +int subpage_span_append(extract_alloc_t* alloc, subpage_t* subpage, span_t* span); +/* Push span onto the end of subpage. */ + +int extract_split_alloc(extract_alloc_t* alloc, split_type_t type, int count, split_t** psplit); +/* Allocate a split_t. */ + +void extract_split_free(extract_alloc_t* alloc, split_t** psplit); #endif |