diff options
author | Robin H. Johnson <robbat2@gentoo.org> | 2010-08-09 00:21:00 +0000 |
---|---|---|
committer | Robin H. Johnson <robbat2@gentoo.org> | 2010-08-09 00:21:00 +0000 |
commit | 4b509e569a5d958c4a81e18dedd3df31a6092391 (patch) | |
tree | 18750b10edc3f28dc63d67ed9549ec1fb53bdd04 /percona | |
parent | Updated 07110 patch for mysql-5.1.49. (diff) | |
download | mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.gz mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.bz2 mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.zip |
Adding latest Percona patches.
Diffstat (limited to 'percona')
30 files changed, 25042 insertions, 0 deletions
diff --git a/percona/5.0.91-b22-20100522/CHECKSUM.MD5 b/percona/5.0.91-b22-20100522/CHECKSUM.MD5 new file mode 100644 index 0000000..87cc289 --- /dev/null +++ b/percona/5.0.91-b22-20100522/CHECKSUM.MD5 @@ -0,0 +1,28 @@ +f94d0861b72103d54a8ff3800847fe02 series +7738fd1556b03bd512cfab64432b8d96 userstatv2.patch +b40d08e599ffbb8b3d654190bd278f28 show_patches.patch +c89ae66aad25d102b1b57ccbea0883dc profiling_slow.patch +23b1125d15f1624bd920ab94333e4ec6 mysqld_safe_syslog.patch +d53be78eae8f3680c4ed90ba1122cbb0 mysql-test.patch +043ffa3cdc3d4f65ed0cc7626e667945 mirror_binlog.patch +1d4b5e4be5d4ec9af9d991579bddfef7 microslow_innodb.patch +ea02d7cf5de508217194d846b1877fa3 microsec_process.patch +74c970feb2f4d7997ee9d10ee0151c8c innodb_use_sys_malloc.patch +824e96231eae8adf47abf3cc2dc7f06b innodb_thread_concurrency_timer_based.patch +10b228f7c1df9441bef5951c4fdcfb33 innodb_split_buf_pool_mutex.patch +6e85bdacf5192de313f4959e1b77441d innodb_show_hashed_memory_standalone.patch +15e9cf8e77330df9dc0f9fed0542cf93 innodb_show_hashed_memory.patch +01397a91f4dede07f869c320342fdb3c innodb_show_bp.patch +4050142f7c8cc1d5857f972c24e4390c innodb_rw_lock_old.patch +75ca0cd1f878afe6360746ecba82726b innodb_rw_lock.patch +7cef98cb62b4620de17955e0f371211b innodb_recovery_patches.patch +f5e5492fa8e2608c29ef781a9448af3e innodb_misc_patch.patch +5de06fbcbb7c2f8562d43670be84c4d7 innodb_locks_held.patch +bf1e0ce08175b3aff68e36e468817cc3 innodb_io_tune.patch +05f6558f5d85308a78e24661807aa95a innodb_io_pattern.patch +0d868a2f57fa762bceb24749ca819190 innodb_io_patches.patch +e6eb72d8c4bc5a922c390530858b69b8 innodb_fsync_source.patch +e7ec26dfed29892247434ac51e432ce6 innodb_extra_status.patch +640f4bf96bec774576648e019c595e4b innodb_extra_rseg.patch +df9f80c668652720a7a89675a153a99a innodb_dict_size_limit.patch +3ca5baf8836512e28e24e5fa3210d903 innodb_check_fragmentation.patch diff --git a/percona/5.0.91-b22-20100522/README-GENTOO b/percona/5.0.91-b22-20100522/README-GENTOO new file mode 100644 
index 0000000..a4e2724 --- /dev/null +++ b/percona/5.0.91-b22-20100522/README-GENTOO @@ -0,0 +1,8 @@ +The following patches, while distributed by Percona, are NOT applied in their +specfile. As such, we do not apply them in Gentoo either: +========= +innodb_extra_status.patch +innodb_io_tune.patch +innodb_rw_lock_old.patch +innodb_show_hashed_memory_standalone.patch +mirror_binlog.patch diff --git a/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch b/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch new file mode 100644 index 0000000..4b16731 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch @@ -0,0 +1,275 @@ +diff -r 936d427a9a15 innobase/btr/btr0cur.c +--- a/innobase/btr/btr0cur.c Mon Dec 22 00:33:03 2008 -0800 ++++ b/innobase/btr/btr0cur.c Mon Dec 22 00:33:11 2008 -0800 +@@ -516,6 +516,14 @@ + == index->table->comp); + } + ++ if (level == 0) { ++ /* Initializes status counters */ ++ innobase_mysql_thd_init_innodb_scan_cont(); ++ innobase_mysql_thd_init_innodb_scan_jump(); ++ innobase_mysql_thd_init_innodb_scan_data(); ++ innobase_mysql_thd_init_innodb_scan_garbage(); ++ } ++ + break; + } + +@@ -663,6 +671,12 @@ + btr_cur_add_path_info(cursor, height, + root_height); + } ++ ++ /* Initializes status counters */ ++ innobase_mysql_thd_init_innodb_scan_cont(); ++ innobase_mysql_thd_init_innodb_scan_jump(); ++ innobase_mysql_thd_init_innodb_scan_data(); ++ innobase_mysql_thd_init_innodb_scan_garbage(); + + break; + } +diff -r 936d427a9a15 innobase/btr/btr0pcur.c +--- a/innobase/btr/btr0pcur.c Mon Dec 22 00:33:03 2008 -0800 ++++ b/innobase/btr/btr0pcur.c Mon Dec 22 00:33:11 2008 -0800 +@@ -381,6 +381,7 @@ + last record of the current page */ + mtr_t* mtr) /* in: mtr */ + { ++ ulint page_no; + ulint next_page_no; + ulint space; + page_t* page; +@@ -393,11 +394,22 @@ + cursor->old_stored = BTR_PCUR_OLD_NOT_STORED; + + page = btr_pcur_get_page(cursor); ++ page_no = buf_frame_get_page_no(page); + + next_page_no = 
btr_page_get_next(page, mtr); + space = buf_frame_get_space_id(page); + + ut_ad(next_page_no != FIL_NULL); ++ ++ if (next_page_no - page_no == 1) { ++ innobase_mysql_thd_increment_innodb_scan_cont(1); ++ } else { ++ innobase_mysql_thd_increment_innodb_scan_jump(1); ++ } ++ innobase_mysql_thd_increment_innodb_scan_data( ++ page_get_data_size(page)); ++ innobase_mysql_thd_increment_innodb_scan_garbage( ++ page_header_get_field(page, PAGE_GARBAGE)); + + next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + ut_a(page_is_comp(next_page) == page_is_comp(page)); +@@ -427,6 +439,7 @@ + record of the current page */ + mtr_t* mtr) /* in: mtr */ + { ++ ulint page_no; + ulint prev_page_no; + ulint space; + page_t* page; +@@ -462,9 +475,20 @@ + btr_pcur_restore_position(latch_mode2, cursor, mtr); + + page = btr_pcur_get_page(cursor); ++ page_no = buf_frame_get_page_no(page); + + prev_page_no = btr_page_get_prev(page, mtr); + space = buf_frame_get_space_id(page); ++ ++ if (page_no - prev_page_no == 1) { ++ innobase_mysql_thd_increment_innodb_scan_cont(1); ++ } else { ++ innobase_mysql_thd_increment_innodb_scan_jump(1); ++ } ++ innobase_mysql_thd_increment_innodb_scan_data( ++ page_get_data_size(page)); ++ innobase_mysql_thd_increment_innodb_scan_garbage( ++ page_header_get_field(page, PAGE_GARBAGE)); + + if (btr_pcur_is_before_first_on_page(cursor, mtr) + && (prev_page_no != FIL_NULL)) { +diff -r 936d427a9a15 innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c Mon Dec 22 00:33:03 2008 -0800 ++++ b/innobase/btr/btr0sea.c Mon Dec 22 00:33:11 2008 -0800 +@@ -861,6 +861,12 @@ + + buf_pool->n_page_gets++; + ++ /* Initializes status counters */ ++ innobase_mysql_thd_init_innodb_scan_cont(); ++ innobase_mysql_thd_init_innodb_scan_jump(); ++ innobase_mysql_thd_init_innodb_scan_data(); ++ innobase_mysql_thd_init_innodb_scan_garbage(); ++ + return(TRUE); + + /*-------------------------------------------*/ +diff -r 936d427a9a15 innobase/include/btr0cur.h +--- 
a/innobase/include/btr0cur.h Mon Dec 22 00:33:03 2008 -0800 ++++ b/innobase/include/btr0cur.h Mon Dec 22 00:33:11 2008 -0800 +@@ -697,6 +697,17 @@ + extern ulint btr_cur_n_non_sea_old; + extern ulint btr_cur_n_sea_old; + ++/*--------------------------------------*/ ++/* prototypes for new functions added to ha_innodb.cc */ ++void innobase_mysql_thd_init_innodb_scan_cont(); ++void innobase_mysql_thd_increment_innodb_scan_cont(ulong length); ++void innobase_mysql_thd_init_innodb_scan_jump(); ++void innobase_mysql_thd_increment_innodb_scan_jump(ulong length); ++void innobase_mysql_thd_init_innodb_scan_data(); ++void innobase_mysql_thd_increment_innodb_scan_data(ulong length); ++void innobase_mysql_thd_init_innodb_scan_garbage(); ++void innobase_mysql_thd_increment_innodb_scan_garbage(ulong length); ++ + #ifndef UNIV_NONINL + #include "btr0cur.ic" + #endif +diff -r 936d427a9a15 patch_info/innodb_check_fragmentation.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_check_fragmentation.info Mon Dec 22 00:33:11 2008 -0800 +@@ -0,0 +1,6 @@ ++File=innodb_check_fragmentation.patch ++Name=Session status to check fragmentation of the last InnoDB scan ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment=The names are Innodb_scan_* +diff -r 936d427a9a15 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Dec 22 00:33:03 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Dec 22 00:33:11 2008 -0800 +@@ -760,6 +760,102 @@ + } + + /************************************************************************* ++Initializes Innodb_scan_blocks_contiguous. */ ++extern "C" ++void ++innobase_mysql_thd_init_innodb_scan_cont() ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_cont = 0; ++ } ++} ++ ++/************************************************************************* ++Increments Innodb_scan_blocks_contiguous. 
*/ ++extern "C" ++void ++innobase_mysql_thd_increment_innodb_scan_cont(ulong length) ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_cont+= length; ++ } ++} ++ ++/************************************************************************* ++Initializes Innodb_scan_blocks_jumpy. */ ++extern "C" ++void ++innobase_mysql_thd_init_innodb_scan_jump() ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_jump = 0; ++ } ++} ++ ++/************************************************************************* ++Increments Innodb_scan_blocks_jumpy. */ ++extern "C" ++void ++innobase_mysql_thd_increment_innodb_scan_jump(ulong length) ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_jump+= length; ++ } ++} ++ ++/************************************************************************* ++Initializes Innodb_scan_data_in_pages. */ ++extern "C" ++void ++innobase_mysql_thd_init_innodb_scan_data() ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_data = 0; ++ } ++} ++ ++/************************************************************************* ++Increments Innodb_scan_data_in_pages. */ ++extern "C" ++void ++innobase_mysql_thd_increment_innodb_scan_data(ulong length) ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_data+= length; ++ } ++} ++ ++/************************************************************************* ++Initializes Innodb_scan_garbages_in_pages. */ ++extern "C" ++void ++innobase_mysql_thd_init_innodb_scan_garbage() ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_garbage = 0; ++ } ++} ++ ++/************************************************************************* ++Increments Innodb_scan_garbages_in_pages. 
*/ ++extern "C" ++void ++innobase_mysql_thd_increment_innodb_scan_garbage(ulong length) ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ thd->status_var.innodb_scan_garbage+= length; ++ } ++} ++ ++/************************************************************************* + Gets the InnoDB transaction handle for a MySQL handler object, creates + an InnoDB transaction struct if the corresponding MySQL thread struct still + lacks one. */ +diff -r 936d427a9a15 sql/mysqld.cc +--- a/sql/mysqld.cc Mon Dec 22 00:33:03 2008 -0800 ++++ b/sql/mysqld.cc Mon Dec 22 00:33:11 2008 -0800 +@@ -6673,6 +6673,10 @@ + {"Handler_write", (char*) offsetof(STATUS_VAR, ha_write_count), SHOW_LONG_STATUS}, + #ifdef HAVE_INNOBASE_DB + {"Innodb_", (char*) &innodb_status_variables, SHOW_VARS}, ++ {"Innodb_scan_pages_contiguous",(char*) offsetof(STATUS_VAR, innodb_scan_cont), SHOW_LONGLONG_STATUS}, ++ {"Innodb_scan_pages_jumpy", (char*) offsetof(STATUS_VAR, innodb_scan_jump), SHOW_LONGLONG_STATUS}, ++ {"Innodb_scan_data_in_pages",(char*) offsetof(STATUS_VAR, innodb_scan_data), SHOW_LONGLONG_STATUS}, ++ {"Innodb_scan_garbages_in_pages",(char*) offsetof(STATUS_VAR, innodb_scan_garbage), SHOW_LONGLONG_STATUS}, + #endif /*HAVE_INNOBASE_DB*/ + {"Key_blocks_not_flushed", (char*) &dflt_key_cache_var.global_blocks_changed, SHOW_KEY_CACHE_LONG}, + {"Key_blocks_unused", (char*) &dflt_key_cache_var.blocks_unused, SHOW_KEY_CACHE_CONST_LONG}, +diff -r 936d427a9a15 sql/sql_class.h +--- a/sql/sql_class.h Mon Dec 22 00:33:03 2008 -0800 ++++ b/sql/sql_class.h Mon Dec 22 00:33:11 2008 -0800 +@@ -729,6 +729,10 @@ + sense to add to the /global/ status variable counter. 
+ */ + double last_query_cost; ++ ulonglong innodb_scan_cont; ++ ulonglong innodb_scan_jump; ++ ulonglong innodb_scan_data; ++ ulonglong innodb_scan_garbage; + } STATUS_VAR; + + /* diff --git a/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch b/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch new file mode 100644 index 0000000..ced1aec --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch @@ -0,0 +1,633 @@ +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-08-27 18:42:17.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-08-27 18:43:11.000000000 +0900 +@@ -1077,6 +1077,124 @@ + } + + /************************************************************************ ++Drops a page hash index based on index */ ++ ++void ++btr_search_drop_page_hash_index_on_index( ++/*=====================================*/ ++ dict_index_t* index) /* in: record descriptor */ ++{ ++ page_t* page; ++ hash_table_t* table; ++ buf_block_t* block; ++ ulint n_fields; ++ ulint n_bytes; ++ rec_t* rec; ++ ulint fold; ++ ulint prev_fold; ++ dulint tree_id; ++ ulint n_cached; ++ ulint n_recs; ++ ulint* folds; ++ ulint i; ++ mem_heap_t* heap = NULL; ++ ulint* offsets; ++ ++ rw_lock_x_lock(&btr_search_latch); ++ mutex_enter(&buf_pool->mutex); ++ ++ table = btr_search_sys->hash_index; ++ ++ block = UT_LIST_GET_LAST(buf_pool->LRU); ++ ++ while (block != NULL) { ++ if (block->index == index && block->is_hashed) { ++ page = block->frame; ++ ++ /* from btr_search_drop_page_hash_index() */ ++ n_fields = block->curr_n_fields; ++ n_bytes = block->curr_n_bytes; ++ ++ ut_a(n_fields + n_bytes > 0); ++ ++ n_recs = page_get_n_recs(page); ++ ++ /* Calculate and cache fold values into an array for fast deletion ++ from the hash index */ ++ ++ folds = mem_alloc(n_recs * sizeof(ulint)); ++ ++ n_cached = 0; ++ ++ rec = page_get_infimum_rec(page); ++ rec = page_rec_get_next(rec); ++ ++ tree_id = btr_page_get_index_id(page); ++ ++ ut_a(0 == 
ut_dulint_cmp(tree_id, index->id)); ++ ++ prev_fold = 0; ++ ++ offsets = NULL; ++ ++ while (!page_rec_is_supremum(rec)) { ++ /* FIXME: in a mixed tree, not all records may have enough ++ ordering fields: */ ++ offsets = rec_get_offsets(rec, index, offsets, ++ n_fields + (n_bytes > 0), &heap); ++ ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0)); ++ fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id); ++ ++ if (fold == prev_fold && prev_fold != 0) { ++ ++ goto next_rec; ++ } ++ ++ /* Remove all hash nodes pointing to this page from the ++ hash chain */ ++ ++ folds[n_cached] = fold; ++ n_cached++; ++next_rec: ++ rec = page_rec_get_next(rec); ++ prev_fold = fold; ++ } ++ ++ for (i = 0; i < n_cached; i++) { ++ ++ ha_remove_all_nodes_to_page(table, folds[i], page); ++ } ++ ++ ut_a(index->search_info->ref_count > 0); ++ index->search_info->ref_count--; ++ ++ block->is_hashed = FALSE; ++ block->index = NULL; ++ ++ if (UNIV_UNLIKELY(block->n_pointers)) { ++ /* Corruption */ ++ ut_print_timestamp(stderr); ++ fprintf(stderr, ++" InnoDB: Corruption of adaptive hash index. After dropping\n" ++"InnoDB: the hash index to a page of %s, still %lu hash nodes remain.\n", ++ index->name, (ulong) block->n_pointers); ++ } ++ ++ mem_free(folds); ++ } ++ ++ block = UT_LIST_GET_PREV(LRU, block); ++ } ++ ++ mutex_exit(&buf_pool->mutex); ++ rw_lock_x_unlock(&btr_search_latch); ++ ++ if (UNIV_LIKELY_NULL(heap)) { ++ mem_heap_free(heap); ++ } ++} ++ ++/************************************************************************ + Drops a page hash index when a page is freed from a fseg to the file system. + Drops possible hash index if the page happens to be in the buffer pool. 
*/ + +diff -ruN a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c +--- a/innobase/dict/dict0boot.c 2009-07-07 21:53:58.000000000 +0900 ++++ b/innobase/dict/dict0boot.c 2009-08-27 18:42:59.000000000 +0900 +@@ -247,6 +247,7 @@ + system tables */ + /*-------------------------*/ + table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, FALSE); ++ table->n_mysql_handles_opened = 1; /* for pin */ + + dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); +@@ -283,6 +284,7 @@ + ut_a(success); + /*-------------------------*/ + table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, FALSE); ++ table->n_mysql_handles_opened = 1; /* for pin */ + + dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0); + dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); +@@ -309,6 +311,7 @@ + ut_a(success); + /*-------------------------*/ + table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, FALSE); ++ table->n_mysql_handles_opened = 1; /* for pin */ + + dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 0,0,0); + dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0); +@@ -345,6 +348,7 @@ + ut_a(success); + /*-------------------------*/ + table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, FALSE); ++ table->n_mysql_handles_opened = 1; /* for pin */ + + dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0); + dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0); +diff -ruN a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c +--- a/innobase/dict/dict0crea.c 2009-07-07 21:53:58.000000000 +0900 ++++ b/innobase/dict/dict0crea.c 2009-08-27 18:42:59.000000000 +0900 +@@ -1178,6 +1178,9 @@ + /* Foreign constraint system tables have already been + created, and they are ok */ + ++ table1->n_mysql_handles_opened = 1; /* for pin */ ++ table2->n_mysql_handles_opened = 1; /* for pin */ ++ + mutex_exit(&(dict_sys->mutex)); + + return(DB_SUCCESS); +@@ -1267,6 
+1270,11 @@ + + trx->op_info = ""; + ++ table1 = dict_table_get_low("SYS_FOREIGN"); ++ table2 = dict_table_get_low("SYS_FOREIGN_COLS"); ++ table1->n_mysql_handles_opened = 1; /* for pin */ ++ table2->n_mysql_handles_opened = 1; /* for pin */ ++ + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); +diff -ruN a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c +--- a/innobase/dict/dict0dict.c 2009-07-07 21:53:58.000000000 +0900 ++++ b/innobase/dict/dict0dict.c 2009-08-27 18:43:11.000000000 +0900 +@@ -638,6 +638,8 @@ + mutex_enter(&(dict_sys->mutex)); + + table = dict_table_get_on_id_low(table_id, trx); ++ ++ dict_table_LRU_trim(table); + + mutex_exit(&(dict_sys->mutex)); + +@@ -752,6 +754,8 @@ + + table = dict_table_get_low(table_name); + ++ dict_table_LRU_trim(table); ++ + mutex_exit(&(dict_sys->mutex)); + + if (table != NULL) { +@@ -787,6 +791,8 @@ + table->n_mysql_handles_opened++; + } + ++ dict_table_LRU_trim(table); ++ + mutex_exit(&(dict_sys->mutex)); + + if (table != NULL) { +@@ -1267,20 +1273,64 @@ + too much space. Currently not used! 
*/ + + void +-dict_table_LRU_trim(void) +-/*=====================*/ ++dict_table_LRU_trim( ++/*================*/ ++ dict_table_t* self) + { + dict_table_t* table; + dict_table_t* prev_table; ++ dict_foreign_t* foreign; ++ ulint n_removed; ++ ulint n_have_parent; ++ ulint cached_foreign_tables; + +- ut_error; ++ //ut_error; + + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + #endif /* UNIV_SYNC_DEBUG */ + ++retry: ++ n_removed = n_have_parent = 0; + table = UT_LIST_GET_LAST(dict_sys->table_LRU); + ++ while ( srv_dict_size_limit && table ++ && ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t) ++ + dict_sys->size) > srv_dict_size_limit ) { ++ prev_table = UT_LIST_GET_PREV(table_LRU, table); ++ ++ if (table == self || table->n_mysql_handles_opened) ++ goto next_loop; ++ ++ cached_foreign_tables = 0; ++ foreign = UT_LIST_GET_FIRST(table->foreign_list); ++ while (foreign != NULL) { ++ if (foreign->referenced_table) ++ cached_foreign_tables++; ++ foreign = UT_LIST_GET_NEXT(foreign_list, foreign); ++ } ++ ++ /* TODO: use table->mem_fix also, if it becomes exact. 
*/ ++ ++ if (cached_foreign_tables == 0) { ++ dict_table_remove_from_cache(table); ++ n_removed++; ++ } else { ++ n_have_parent++; ++ } ++next_loop: ++ table = prev_table; ++ } ++ ++ if ( srv_dict_size_limit && n_have_parent && n_removed ++ && ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t) ++ + dict_sys->size) > srv_dict_size_limit ) ++ goto retry; ++/* + while (table && (dict_sys->size > + buf_pool_get_max_size() / DICT_POOL_PER_VARYING)) { + +@@ -1292,6 +1342,7 @@ + + table = prev_table; + } ++*/ + } + + /************************************************************************** +@@ -1565,6 +1616,10 @@ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); + #endif /* UNIV_SYNC_DEBUG */ ++ /* remove all entry of the index from adaptive hash index, ++ because removing from adaptive hash index needs dict_index */ ++ if (srv_use_adaptive_hash_indexes && srv_dict_size_limit) ++ btr_search_drop_page_hash_index_on_index(index); + + /* We always create search info whether or not adaptive + hash index is enabled or not. 
*/ +diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c +--- a/innobase/ibuf/ibuf0ibuf.c 2009-08-27 18:42:17.000000000 +0900 ++++ b/innobase/ibuf/ibuf0ibuf.c 2009-08-27 18:42:59.000000000 +0900 +@@ -535,6 +535,7 @@ + sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space); + /* use old-style record format for the insert buffer */ + table = dict_mem_table_create(buf, space, 2, FALSE); ++ table->n_mysql_handles_opened = 1; /* for pin */ + + dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0); + dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0); +diff -ruN a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h +--- a/innobase/include/btr0sea.h 2009-07-07 21:54:00.000000000 +0900 ++++ b/innobase/include/btr0sea.h 2009-08-27 18:43:11.000000000 +0900 +@@ -97,6 +97,13 @@ + /*============================*/ + page_t* page); /* in: index page, s- or x-latched */ + /************************************************************************ ++Drops a page hash index based on index */ ++ ++void ++btr_search_drop_page_hash_index_on_index( ++/*=====================================*/ ++ dict_index_t* index); /* in: record descriptor */ ++/************************************************************************ + Drops a page hash index when a page is freed from a fseg to the file system. + Drops possible hash index if the page happens to be in the buffer pool. 
*/ + +diff -ruN a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h +--- a/innobase/include/dict0dict.h 2009-07-07 21:54:01.000000000 +0900 ++++ b/innobase/include/dict0dict.h 2009-08-27 18:42:59.000000000 +0900 +@@ -938,6 +938,11 @@ + const char* ptr, /* in: scan from */ + const char* string);/* in: look for this */ + ++void ++dict_table_LRU_trim( ++/*================*/ ++ dict_table_t* self); ++ + /* Buffers for storing detailed information about the latest foreign key + and unique key errors */ + extern FILE* dict_foreign_err_file; +diff -ruN a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic +--- a/innobase/include/dict0dict.ic 2009-07-07 21:54:01.000000000 +0900 ++++ b/innobase/include/dict0dict.ic 2009-08-27 18:42:59.000000000 +0900 +@@ -533,6 +533,13 @@ + + HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, table, + ut_strcmp(table->name, table_name) == 0); ++ ++ /* make young in table_LRU */ ++ if (table) { ++ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); ++ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); ++ } ++ + return(table); + } + +@@ -592,6 +599,10 @@ + if (table != NULL) { + table->mem_fix++; + ++ /* make young in table_LRU */ ++ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); ++ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); ++ + /* lock_push(trx, table, LOCK_DICT_MEM_FIX) */ + } + +diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h 2009-08-27 18:42:17.000000000 +0900 ++++ b/innobase/include/srv0srv.h 2009-08-27 18:42:59.000000000 +0900 +@@ -147,6 +147,8 @@ + extern uint srv_read_ahead; + extern uint srv_adaptive_checkpoint; + ++extern ulint srv_dict_size_limit; ++ + extern volatile ibool srv_io_pattern; + extern ulong srv_io_pattern_trace; + extern ulong srv_io_pattern_trace_running; +@@ -552,6 +554,7 @@ + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; ++ ulint innodb_dict_tables; + ulint 
innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-08-27 18:42:17.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-08-27 18:42:59.000000000 +0900 +@@ -353,6 +353,8 @@ + uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ + uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */ + ++ulint srv_dict_size_limit = 0; ++ + volatile ibool srv_io_pattern = FALSE; + ulint srv_io_pattern_trace = 0; + ulint srv_io_pattern_trace_running = 0; +@@ -1953,6 +1955,7 @@ + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; ++ export_vars.innodb_dict_tables= (dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0); + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; +diff -ruN a/mysql-test/r/innodb_dict_size_limit.result b/mysql-test/r/innodb_dict_size_limit.result +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/mysql-test/r/innodb_dict_size_limit.result 2009-08-27 18:42:59.000000000 +0900 +@@ -0,0 +1,60 @@ ++DROP TABLE IF EXISTS `test_5`; ++DROP TABLE IF EXISTS `test_4`; ++DROP TABLE IF EXISTS `test_3`; ++DROP TABLE IF EXISTS `test_2`; ++DROP TABLE IF EXISTS `test_1`; ++SET storage_engine=InnoDB; ++SET GLOBAL innodb_dict_size_limit=1; ++FLUSH TABLES; ++CREATE TABLE `test_1` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_2` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_3` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_4` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_5` (`a` int, `b` int, PRIMARY KEY (`a`)); ++ALTER TABLE `test_5` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_4`(`a`); 
++ALTER TABLE `test_4` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_3`(`a`); ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 9 ++FLUSH TABLES; ++SELECT * FROM `test_1`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 8 ++SELECT * FROM `test_3`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 11 ++FLUSH TABLES; ++SELECT * FROM `test_2`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 8 ++SELECT * FROM `test_1`; ++a b ++FLUSH TABLES; ++SELECT * FROM `test_4`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 9 ++SELECT * FROM `test_3`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 10 ++SET GLOBAL innodb_dict_size_limit=0; ++FLUSH TABLES; ++SELECT * FROM `test_2`; ++a b ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++Variable_name Value ++Innodb_dict_tables 11 ++DROP TABLE `test_5`; ++DROP TABLE `test_4`; ++DROP TABLE `test_3`; ++DROP TABLE `test_2`; ++DROP TABLE `test_1`; +diff -ruN a/mysql-test/t/innodb_dict_size_limit.test b/mysql-test/t/innodb_dict_size_limit.test +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/mysql-test/t/innodb_dict_size_limit.test 2009-08-27 18:42:59.000000000 +0900 +@@ -0,0 +1,63 @@ ++# ++# Test for new variable innodb_dict_size_limit; ++# ++-- source include/have_innodb.inc ++ ++--disable_warnings ++DROP TABLE IF EXISTS `test_5`; ++DROP TABLE IF EXISTS `test_4`; ++DROP TABLE IF EXISTS `test_3`; ++DROP TABLE IF EXISTS `test_2`; ++DROP TABLE IF EXISTS `test_1`; ++--enable_warnings ++ ++SET storage_engine=InnoDB; ++SET GLOBAL innodb_dict_size_limit=1; ++ ++FLUSH TABLES; ++ ++CREATE TABLE `test_1` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_2` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_3` (`a` int, `b` int, PRIMARY 
KEY (`a`)); ++CREATE TABLE `test_4` (`a` int, `b` int, PRIMARY KEY (`a`)); ++CREATE TABLE `test_5` (`a` int, `b` int, PRIMARY KEY (`a`)); ++ ++ALTER TABLE `test_5` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_4`(`a`); ++ALTER TABLE `test_4` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_3`(`a`); ++ ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++FLUSH TABLES; ++SELECT * FROM `test_1`; ++ ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++SELECT * FROM `test_3`; ++ ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++FLUSH TABLES; ++SELECT * FROM `test_2`; ++ ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++SELECT * FROM `test_1`; ++FLUSH TABLES; ++SELECT * FROM `test_4`; ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++SELECT * FROM `test_3`; ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++SET GLOBAL innodb_dict_size_limit=0; ++FLUSH TABLES; ++SELECT * FROM `test_2`; ++ ++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables'; ++ ++DROP TABLE `test_5`; ++DROP TABLE `test_4`; ++DROP TABLE `test_3`; ++DROP TABLE `test_2`; ++DROP TABLE `test_1`; ++ +diff -ruN a/patch_info/innodb_dict_size_limit.info b/patch_info/innodb_dict_size_limit.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_dict_size_limit.info 2009-08-27 18:42:59.000000000 +0900 +@@ -0,0 +1,9 @@ ++File=innodb_dict_size_limit.patch ++Name=Limit dictionary cache size ++Version=1.0 ++Author=Percona ++License=GPL ++Comment=Variable innodb_dict_size_limit in bytes ++ChangeLog= ++2009-01-26 ++YK: Initial release +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-08-27 18:42:17.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-08-27 18:42:59.000000000 +0900 +@@ -288,6 +288,8 @@ + (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, + {"dblwr_writes", + (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, ++ {"dict_tables", ++ (char*) &export_vars.innodb_dict_tables, SHOW_LONG}, + {"log_waits", + (char*) &export_vars.innodb_log_waits, 
SHOW_LONG}, + {"log_write_requests", +diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h +--- a/sql/ha_innodb.h 2009-08-27 18:42:17.000000000 +0900 ++++ b/sql/ha_innodb.h 2009-08-27 18:42:59.000000000 +0900 +@@ -243,6 +243,7 @@ + extern ulong srv_enable_unsafe_group_commit; + extern uint srv_read_ahead; + extern uint srv_adaptive_checkpoint; ++extern ulong srv_dict_size_limit; + extern ulong srv_show_locks_held; + extern ulong srv_show_verbose_locks; + extern ulong srv_io_pattern_trace; +diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc +--- a/sql/mysqld.cc 2009-08-27 18:42:17.000000000 +0900 ++++ b/sql/mysqld.cc 2009-08-27 18:42:59.000000000 +0900 +@@ -5101,6 +5101,7 @@ + OPT_INNODB_ADAPTIVE_CHECKPOINT, + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_DICT_SIZE_LIMIT, + OPT_INNODB_ADAPTIVE_HASH_INDEX, + OPT_FEDERATED, + OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM +@@ -5464,6 +5465,10 @@ + "Number of background write I/O threads in InnoDB.", + (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, + 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_dict_size_limit", OPT_INNODB_DICT_SIZE_LIMIT, ++ "Limit the allocated memory for dictionary cache. 
(0: unlimited)", ++ (gptr*) &srv_dict_size_limit, (gptr*) &srv_dict_size_limit, 0, ++ GET_ULONG, REQUIRED_ARG, 0, 0, ULONG_MAX, 0, 0 ,0}, + {"innodb_io_pattern_trace", OPT_INNODB_IO_PATTERN_TRACE, + "Create/Drop the internal hash table for IO pattern tracing.", + (gptr*) &srv_io_pattern_trace, (gptr*) &srv_io_pattern_trace, +diff -ruN a/sql/set_var.cc b/sql/set_var.cc +--- a/sql/set_var.cc 2009-08-27 18:42:17.000000000 +0900 ++++ b/sql/set_var.cc 2009-08-27 18:42:59.000000000 +0900 +@@ -540,6 +540,8 @@ + sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint", + &srv_adaptive_checkpoint, + &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint); ++sys_var_long_ptr sys_innodb_dict_size_limit("innodb_dict_size_limit", ++ &srv_dict_size_limit); + sys_var_long_ptr sys_innodb_show_locks_held( + "innodb_show_locks_held", + &srv_show_locks_held); +@@ -930,6 +932,7 @@ + &sys_innodb_read_ahead, + &sys_innodb_enable_unsafe_group_commit, + &sys_innodb_adaptive_checkpoint, ++ &sys_innodb_dict_size_limit, + &sys_innodb_show_locks_held, + &sys_innodb_show_verbose_locks, + &sys_innodb_io_pattern_trace, +@@ -1084,6 +1087,7 @@ + {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, ++ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS}, + {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS}, + {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS}, + {sys_innodb_io_pattern_size_limit.name, (char*) &sys_innodb_io_pattern_size_limit, SHOW_SYS}, diff --git a/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch b/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch new file mode 100644 index 0000000..cab3b26 --- /dev/null +++ 
b/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch @@ -0,0 +1,243 @@ +diff -r 85e7025cf2d1 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Fri Jul 03 15:41:41 2009 -0700 ++++ b/innobase/include/srv0srv.h Fri Jul 03 15:41:47 2009 -0700 +@@ -146,6 +146,8 @@ + extern ulint srv_enable_unsafe_group_commit; + extern uint srv_read_ahead; + extern uint srv_adaptive_checkpoint; ++ ++extern ulint srv_extra_rsegments; + + extern ulint srv_dict_size_limit; + +diff -r 85e7025cf2d1 innobase/include/trx0sys.h +--- a/innobase/include/trx0sys.h Fri Jul 03 15:41:41 2009 -0700 ++++ b/innobase/include/trx0sys.h Fri Jul 03 15:41:47 2009 -0700 +@@ -105,6 +105,13 @@ + void + trx_sys_create(void); + /*================*/ ++/********************************************************************* ++Create extra rollback segments when create_new_db */ ++ ++void ++trx_sys_create_extra_rseg( ++/*======================*/ ++ ulint num); /* in: number of extra user rollback segments */ + /******************************************************************** + Looks for a free slot for a rollback segment in the trx system file copy. 
*/ + +diff -r 85e7025cf2d1 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Fri Jul 03 15:41:41 2009 -0700 ++++ b/innobase/srv/srv0srv.c Fri Jul 03 15:41:47 2009 -0700 +@@ -352,6 +352,8 @@ + + uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ + uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */ ++ ++ulint srv_extra_rsegments = 0; /* extra rseg for users */ + + ulint srv_dict_size_limit = 0; + +diff -r 85e7025cf2d1 innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c Fri Jul 03 15:41:41 2009 -0700 ++++ b/innobase/srv/srv0start.c Fri Jul 03 15:41:47 2009 -0700 +@@ -1418,6 +1418,8 @@ + dict_create(); + srv_startup_is_before_trx_rollback_phase = FALSE; + ++ if (srv_extra_rsegments) ++ trx_sys_create_extra_rseg(srv_extra_rsegments); + #ifdef UNIV_LOG_ARCHIVE + } else if (srv_archive_recovery) { + fprintf(stderr, +diff -r 85e7025cf2d1 innobase/trx/trx0sys.c +--- a/innobase/trx/trx0sys.c Fri Jul 03 15:41:41 2009 -0700 ++++ b/innobase/trx/trx0sys.c Fri Jul 03 15:41:47 2009 -0700 +@@ -944,3 +944,28 @@ + + trx_sys_init_at_db_start(); + } ++ ++/********************************************************************* ++Create extra rollback segments when create_new_db */ ++ ++void ++trx_sys_create_extra_rseg( ++/*======================*/ ++ ulint num) /* in: number of extra user rollback segments */ ++{ ++ mtr_t mtr; ++ ulint slot_no; ++ ulint i; ++ ++ /* Create extra rollback segments */ ++ mtr_start(&mtr); ++ for (i = 1; i < num + 1; i++) { ++ if(!trx_rseg_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no, &mtr)) { ++ fprintf(stderr, ++"InnoDB: Warning: Failed to create extra rollback segments.\n"); ++ break; ++ } ++ ut_a(slot_no == i); ++ } ++ mtr_commit(&mtr); ++} +diff -r 85e7025cf2d1 patch_info/innodb_extra_rseg.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_extra_rseg.info Fri Jul 03 15:41:47 2009 -0700 +@@ -0,0 +1,6 @@ ++File=innodb_extra_rseg.patch ++Name=allow to create extra rollback segments ++Version=1.0 
++Author=Percona <info@percona.com> ++License=GPL ++Comment +diff -r 85e7025cf2d1 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Fri Jul 03 15:41:41 2009 -0700 ++++ b/sql/ha_innodb.cc Fri Jul 03 15:41:47 2009 -0700 +@@ -152,6 +152,7 @@ + innobase_open_files; + + long innobase_read_io_threads, innobase_write_io_threads; ++long innobase_extra_rsegments; + longlong innobase_buffer_pool_size, innobase_log_file_size; + + /* The default values for the following char* start-up parameters +@@ -1521,6 +1522,8 @@ + srv_n_read_io_threads = (ulint) innobase_read_io_threads; + srv_n_write_io_threads = (ulint) innobase_write_io_threads; + ++ srv_extra_rsegments = (ulint) innobase_extra_rsegments; ++ + srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout; + srv_force_recovery = (ulint) innobase_force_recovery; + +diff -r 85e7025cf2d1 sql/ha_innodb.h +--- a/sql/ha_innodb.h Fri Jul 03 15:41:41 2009 -0700 ++++ b/sql/ha_innodb.h Fri Jul 03 15:41:47 2009 -0700 +@@ -205,6 +205,7 @@ + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; + extern long innobase_read_io_threads, innobase_write_io_threads; ++extern long innobase_extra_rsegments; + extern long innobase_force_recovery; + extern long innobase_open_files; + extern char *innobase_data_home_dir, *innobase_data_file_path; +diff -r 85e7025cf2d1 sql/mysqld.cc +--- a/sql/mysqld.cc Fri Jul 03 15:41:41 2009 -0700 ++++ b/sql/mysqld.cc Fri Jul 03 15:41:47 2009 -0700 +@@ -5101,6 +5101,7 @@ + OPT_INNODB_ADAPTIVE_CHECKPOINT, + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_EXTRA_RSEGMENTS, + OPT_INNODB_DICT_SIZE_LIMIT, + OPT_INNODB_ADAPTIVE_HASH_INDEX, + OPT_FEDERATED, +@@ -5465,6 +5466,10 @@ + "Number of background write I/O threads in InnoDB.", + (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, + 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_extra_rsegments", OPT_INNODB_EXTRA_RSEGMENTS, ++ "Number of extra user 
rollback segments when create new database.", ++ (gptr*) &innobase_extra_rsegments, (gptr*) &innobase_extra_rsegments, ++ 0, GET_LONG, REQUIRED_ARG, 0, 0, 127, 0, 0, 0}, + {"innodb_dict_size_limit", OPT_INNODB_DICT_SIZE_LIMIT, + "Limit the allocated memory for dictionary cache. (0: unlimited)", + (gptr*) &srv_dict_size_limit, (gptr*) &srv_dict_size_limit, 0, +diff -r 85e7025cf2d1 sql/set_var.cc +--- a/sql/set_var.cc Fri Jul 03 15:41:41 2009 -0700 ++++ b/sql/set_var.cc Fri Jul 03 15:41:47 2009 -0700 +@@ -1087,6 +1087,7 @@ + {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, ++ {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG}, + {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS}, + {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS}, + {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS}, +diff -r 85e7025cf2d1 sql/sql_show.cc +--- a/sql/sql_show.cc Fri Jul 03 15:41:41 2009 -0700 ++++ b/sql/sql_show.cc Fri Jul 03 15:41:47 2009 -0700 +@@ -39,6 +39,8 @@ + #include "srv0srv.h" + #include "buf0buf.h" + #include "dict0dict.h" ++#include "trx0rseg.h" /* for trx_rseg_struct */ ++#include "trx0sys.h" /* for trx_sys */ + } + /* We need to undef it in InnoDB */ + #undef byte +@@ -4180,6 +4182,45 @@ + DBUG_RETURN(returnable); + } + ++int ++innodb_rseg_fill( ++/*=================*/ ++ THD* thd, /* in: thread */ ++ TABLE_LIST* tables, /* in/out: tables to fill */ ++ COND* cond) /* in: condition (ignored) */ ++{ ++ TABLE* table = (TABLE *) tables->table; ++ int status = 0; ++ trx_rseg_t* rseg; ++ ++ DBUG_ENTER("innodb_rseg_fill"); ++ ++ /* deny access to non-superusers */ ++ if (check_global_access(thd, PROCESS_ACL)) { ++ ++ DBUG_RETURN(0); ++ } ++ ++ rseg = 
UT_LIST_GET_FIRST(trx_sys->rseg_list); ++ ++ while (rseg) { ++ table->field[0]->store(rseg->id); ++ table->field[1]->store(rseg->space); ++ table->field[2]->store(rseg->page_no); ++ table->field[3]->store(rseg->max_size); ++ table->field[4]->store(rseg->curr_size); ++ ++ if (schema_table_store_record(thd, table)) { ++ status = 1; ++ break; ++ } ++ ++ rseg = UT_LIST_GET_NEXT(rseg_list, rseg); ++ } ++ ++ DBUG_RETURN(status); ++} ++ + /* + Find schema_tables elment by name + +@@ -4996,6 +5037,16 @@ + {"INDEX_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "index name"}, + {"N_READ", 11, MYSQL_TYPE_LONG, 0, 0, "read ios"}, + {"N_WRITE", 11, MYSQL_TYPE_LONG, 0, 0, "write ios"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ ++ST_FIELD_INFO innodb_rseg_fields_info[]= ++{ ++ {"RSEG_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""}, ++ {"SPACE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""}, ++ {"PAGE_NO", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""}, ++ {"MAX_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""}, ++ {"CURR_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""}, + {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} + }; + #endif +@@ -5177,6 +5228,8 @@ + #ifdef HAVE_INNOBASE_DB + {"INNODB_IO_PATTERN", innodb_io_pattern_field_info, create_schema_table, + innodb_io_pattern_fill_table, 0, 0, -1, -1, 0}, ++ {"INNODB_RSEG", innodb_rseg_fields_info, create_schema_table, ++ innodb_rseg_fill, 0, 0, -1, -1, 0}, + #endif + {0, 0, 0, 0, 0, 0, 0, 0, 0} + }; diff --git a/percona/5.0.91-b22-20100522/innodb_extra_status.patch b/percona/5.0.91-b22-20100522/innodb_extra_status.patch new file mode 100644 index 0000000..adc1642 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_extra_status.patch @@ -0,0 +1,747 @@ +diff -r b059d02ec814 innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/buf/buf0buf.c Mon Nov 03 05:09:34 2008 -0800 +@@ -2353,6 +2353,7 @@ + "AWE: Database pages and free buffers mapped in 
frames %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } ++ if (file) { + fprintf(file, + "Buffer pool size %lu\n" + "Free buffers %lu\n" +@@ -2371,11 +2372,13 @@ + + buf_pool->init_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + ++ } // if (file) + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool->last_printout_time); + buf_pool->last_printout_time = current_time; + ++ if (file) { + fprintf(file, + "Pages read %lu, created %lu, written %lu\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", +@@ -2405,6 +2408,7 @@ + } else { + fputs("No buffer pool page gets since the last printout\n", + file); ++ } + } + + buf_pool->n_page_gets_old = buf_pool->n_page_gets; +diff -r b059d02ec814 innobase/ibuf/ibuf0ibuf.c +--- a/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:09:34 2008 -0800 +@@ -3519,9 +3519,15 @@ + + mutex_enter(&ibuf_mutex); + ++ inno_ibuf_size = 0; ++ inno_ibuf_inserts = 0; ++ inno_ibuf_merged_recs = 0; ++ inno_ibuf_merges = 0; ++ + data = UT_LIST_GET_FIRST(ibuf->data_list); + + while (data) { ++ if (file) { + fprintf(file, + "Ibuf: size %lu, free list len %lu, seg size %lu,\n" + "%lu inserts, %lu merged recs, %lu merges\n", +@@ -3542,6 +3548,12 @@ + } + } + #endif ++ } // if (file) ++ inno_ibuf_size += (ulong) data->size; ++ inno_ibuf_inserts += (ulong) data->n_inserts; ++ inno_ibuf_merged_recs += (ulong) data->n_merged_recs; ++ inno_ibuf_merges += (ulong) data->n_merges; ++ + data = UT_LIST_GET_NEXT(data_list, data); + } + +diff -r b059d02ec814 innobase/include/lock0lock.h +--- a/innobase/include/lock0lock.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/include/lock0lock.h Mon Nov 03 05:09:34 2008 -0800 +@@ -24,6 +24,10 @@ + #endif /* UNIV_DEBUG */ + /* Buffer for storing information about the most recent deadlock error */ + extern FILE* lock_latest_err_file; ++ ++/* number of deadlocks happened so far */ ++extern 
ulint innodb_deadlocks; ++ + + /************************************************************************* + Gets the size of a lock struct. */ +diff -r b059d02ec814 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/include/srv0srv.h Mon Nov 03 05:09:34 2008 -0800 +@@ -261,6 +261,12 @@ + /* variable to count the number of random read-aheads were done */ + extern ulint srv_read_ahead_rnd; + ++/* variable to identify if there is currently a long semaphore wait */ ++extern ibool srv_long_lock_wait; ++ ++/* variable to count the number long semaphore waits noticed */ ++extern ulint srv_long_lock_waits; ++ + /* Number of IO operations read/write done for all threads */ + extern ulint os_aio_read_requests; + extern ulint os_aio_write_requests; +@@ -278,6 +284,26 @@ + extern ulint inno_pending_ibuf_aio_reads; + extern ulint inno_pending_log_ios; + extern ulint inno_pending_sync_ios; ++ ++/* all 24 innodb status variables, exported to status */ ++extern ulint inno_transaction_count; ++extern ulint inno_transaction_purge_count; ++extern ulint inno_transaction_purge_lag; ++extern ulint inno_num_active_transactions; ++extern ulint inno_summed_transaction_age; ++extern ulint inno_longest_transaction_age; ++extern ulint inno_lock_wait_timeouts; ++extern ulint inno_num_lock_waiters; ++extern ulint inno_summed_lock_wait_time; ++extern ulint inno_longest_lock_wait; ++extern ulint inno_os_reads; ++extern ulint inno_os_writes; ++extern ulint inno_os_fsyncs; ++extern ulint inno_ibuf_size; ++extern ulint inno_ibuf_inserts; ++extern ulint inno_ibuf_merged_recs; ++extern ulint inno_ibuf_merges; ++extern ulint inno_log_ios_done; + + /* In this structure we store status variables to be passed to MySQL */ + typedef struct export_var_struct export_struc; +@@ -552,6 +578,7 @@ + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; ++ ulint innodb_dict_size; + ulint innodb_buffer_pool_pages_total; + ulint 
innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; +@@ -587,6 +614,43 @@ + ulint innodb_rows_inserted; + ulint innodb_rows_updated; + ulint innodb_rows_deleted; ++ ibool innodb_long_lock_wait; ++ ulint innodb_long_lock_waits; ++ ++ ulint innodb_os_aio_read_requests; ++ ulint innodb_os_aio_write_requests; ++ ulint innodb_os_aio_pages_read; ++ ulint innodb_os_aio_pages_written; ++ ib_longlong innodb_os_aio_read_time; ++ ib_longlong innodb_os_aio_write_time; ++ ib_longlong innodb_os_aio_read_time_avg; ++ ib_longlong innodb_os_aio_write_time_avg; ++ ulint innodb_deadlocks; ++ ++ // the following 24 variables are exported to "show status" ++ ulint inno_transaction_count; ++ ulint inno_transaction_purge_count; ++ ulint inno_transaction_purge_lag; ++ ulint inno_num_active_transactions; ++ ulint inno_summed_transaction_age; ++ ulint inno_longest_transaction_age; ++ ulint inno_lock_wait_timeouts; ++ ulint inno_num_lock_waiters; ++ ulint inno_summed_lock_wait_time; ++ ulint inno_longest_lock_wait; ++ ulint inno_pending_normal_aio_reads; ++ ulint inno_pending_normal_aio_writes; ++ ulint inno_pending_ibuf_aio_reads; ++ ulint inno_pending_log_ios; ++ ulint inno_pending_sync_ios; ++ ulint inno_os_reads; ++ ulint inno_os_writes; ++ ulint inno_os_fsyncs; ++ ulint inno_ibuf_size; ++ ulint inno_ibuf_inserts; ++ ulint inno_ibuf_merged_recs; ++ ulint inno_ibuf_merges; ++ ulint inno_log_ios_done; + }; + + /* The server system struct */ +diff -r b059d02ec814 innobase/lock/lock0lock.c +--- a/innobase/lock/lock0lock.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/lock/lock0lock.c Mon Nov 03 05:09:34 2008 -0800 +@@ -360,6 +360,9 @@ + ibool lock_deadlock_found = FALSE; + FILE* lock_latest_err_file; + ++/* number of deadlocks happened so far */ ++ulint innodb_deadlocks = 0; ++ + /* Flags for recursive deadlock search */ + #define LOCK_VICTIM_IS_START 1 + #define LOCK_VICTIM_IS_OTHER 2 +@@ -3304,6 +3307,7 @@ + + FILE* ef = lock_latest_err_file; + ++ innodb_deadlocks++; + 
rewind(ef); + ut_print_timestamp(ef); + +@@ -4238,6 +4242,7 @@ + innobase_mysql_prepare_print_arbitrary_thd(); + lock_mutex_enter_kernel(); + ++ if (file) { + if (lock_deadlock_found) { + fputs( + "------------------------\n" +@@ -4269,6 +4274,12 @@ + fprintf(file, + "Total number of lock structs in row lock hash table %lu\n", + (ulong) lock_get_n_rec_locks()); ++ } // if (file) ++ inno_transaction_purge_count = ++ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no); ++ inno_transaction_count = ++ (ulong) ut_dulint_get_low(trx_sys->max_trx_id); ++ inno_transaction_purge_lag = (ulong) trx_sys->rseg_history_len; + } + + /************************************************************************* +@@ -4289,7 +4300,17 @@ + ulint i; + mtr_t mtr; + trx_t* trx; +- ++ time_t current_time = time(NULL); ++ ++ /* init all counters to be updated */ ++ inno_num_lock_waiters = 0; ++ inno_summed_lock_wait_time = 0; ++ inno_longest_lock_wait = 0; ++ inno_num_active_transactions = 0; ++ inno_summed_transaction_age = 0; ++ inno_longest_transaction_age = 0; ++ ++ if (file) { + fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); + + /* First print info on non-active transactions */ +@@ -4304,6 +4325,7 @@ + + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } ++ } // if (file) + + loop: + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); +@@ -4330,6 +4352,7 @@ + } + + if (nth_lock == 0) { ++ if (file) { + fputs("---", file); + trx_print(file, trx, 600); + +@@ -4341,11 +4364,27 @@ + (ulong) ut_dulint_get_high(trx->read_view->up_limit_id), + (ulong) ut_dulint_get_low(trx->read_view->up_limit_id)); + } ++ } // if (file) ++ ++ if (trx->conc_state == TRX_ACTIVE) { ++ ulong trx_age = (ulong)difftime(time(NULL), trx->start_time); ++ inno_num_active_transactions++; ++ inno_summed_transaction_age += trx_age; ++ if (inno_longest_transaction_age < trx_age) /* keep the maximum age seen in this pass */ ++ inno_longest_transaction_age = trx_age; ++ } + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { ++ ulong wait_time = (ulong)difftime(current_time, 
++ trx->wait_started); ++ inno_num_lock_waiters++; ++ inno_summed_lock_wait_time += wait_time; ++ if (inno_longest_lock_wait < wait_time) ++ inno_longest_lock_wait = wait_time; ++ if (file) { + fprintf(file, + "------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n", +- (ulong)difftime(time(NULL), trx->wait_started)); ++ wait_time); + + if (lock_get_type(trx->wait_lock) == LOCK_REC) { + lock_rec_print(file, trx->wait_lock); +@@ -4354,10 +4393,16 @@ + } + + fputs("------------------\n", file); +- } +- } +- +- if (!srv_print_innodb_lock_monitor) { ++ } // if (file) ++ } ++ } ++ ++ /* don't print locks per transaction if either ++ 1) srv_print_innodb_lock_monitor is NOT set, ++ ie no magic table innodb_lock_monitor is created, or ++ 2) file == NULL, ie, at counter updating stage from "show status" ++ */ ++ if (!srv_print_innodb_lock_monitor || !file) { + nth_trx++; + goto loop; + } +diff -r b059d02ec814 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:09:34 2008 -0800 +@@ -267,6 +267,35 @@ + ulint inno_pending_log_ios = 0; + ulint inno_pending_sync_ios = 0; + ++/* variable to identify if there is currently a long semaphore wait */ ++ibool srv_long_lock_wait = FALSE; ++ ++/* variable to count the number long semaphore waits noticed */ ++ulint srv_long_lock_waits = 0; ++ ++/* time interval in seconds allowed to calling innodb_show_status functions */ ++extern long innobase_min_status_update_time_interval; ++ ++/* all 24 innodb status variables, exported to status */ ++ulint inno_transaction_count = 0; ++ulint inno_transaction_purge_count = 0; ++ulint inno_transaction_purge_lag = 0; ++ulint inno_num_active_transactions = 0; ++ulint inno_summed_transaction_age = 0; ++ulint inno_longest_transaction_age = 0; ++ulint inno_lock_wait_timeouts = 0; /* Counts number of lock wait timeouts. 
*/ ++ulint inno_num_lock_waiters = 0; ++ulint inno_summed_lock_wait_time = 0; ++ulint inno_longest_lock_wait = 0; ++ulint inno_os_reads = 0; ++ulint inno_os_writes = 0; ++ulint inno_os_fsyncs = 0; ++ulint inno_ibuf_size = 0; ++ulint inno_ibuf_inserts = 0; ++ulint inno_ibuf_merged_recs = 0; ++ulint inno_ibuf_merges = 0; ++ulint inno_log_ios_done = 0; ++ + /* structure to pass status variables to MySQL */ + export_struc export_vars; + +@@ -419,6 +448,10 @@ + const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; + + time_t srv_last_monitor_time; ++ ++/* last time innodb status were updated thru show status */ ++time_t srv_last_innodb_status_time = 0; ++ + + mutex_t srv_innodb_monitor_mutex; + +@@ -677,6 +710,24 @@ + + ulint srv_n_threads_active[SRV_MASTER + 1]; + ulint srv_n_threads[SRV_MASTER + 1]; ++ ++/************************************************************************* ++Prints counters for work done by srv_master_thread. */ ++ ++static ++void ++srv_print_extra( ++/*===================*/ ++ FILE *file) /* in: output stream */ ++{ ++ fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, " ++ "%lu 10_second, %lu background, %lu flush\n", ++ srv_main_1_second_loops, srv_main_sleeps, ++ srv_main_10_second_loops, srv_main_background_loops, ++ srv_main_flush_loops); ++ fprintf(file, "srv_master_thread log flush: %lu sync, %lu async\n", ++ srv_sync_flush, srv_async_flush); ++} + + /************************************************************************* + Sets the info describing an i/o thread current state. 
*/ +@@ -1685,12 +1736,13 @@ + fputs("----------\n" + "BACKGROUND THREAD\n" + "----------\n", file); ++ srv_print_extra(file); + fil_print(file); +- + + fputs("----------\n" + "SEMAPHORES\n" + "----------\n", file); ++ fprintf(file, "Lock wait timeouts %lu\n", inno_lock_wait_timeouts); + sync_print(file); + + /* Conceptually, srv_innodb_monitor_mutex has a very high latching +@@ -1709,24 +1761,6 @@ + + mutex_exit(&dict_foreign_err_mutex); + +- lock_print_info_summary(file); +- if (trx_start) { +- long t = ftell(file); +- if (t < 0) { +- *trx_start = ULINT_UNDEFINED; +- } else { +- *trx_start = (ulint) t; +- } +- } +- lock_print_info_all_transactions(file); +- if (trx_end) { +- long t = ftell(file); +- if (t < 0) { +- *trx_end = ULINT_UNDEFINED; +- } else { +- *trx_end = (ulint) t; +- } +- } + fputs("--------\n" + "FILE I/O\n" + "--------\n", file); +@@ -1815,6 +1849,27 @@ + (srv_n_rows_read - srv_n_rows_read_old) + / time_elapsed); + ++ /* Print open transaction details */ ++ lock_print_info_summary(file); ++ ++ if (trx_start) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_start = ULINT_UNDEFINED; ++ } else { ++ *trx_start = (ulint) t; ++ } ++ } ++ lock_print_info_all_transactions(file); ++ if (trx_end) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_end = ULINT_UNDEFINED; ++ } else { ++ *trx_end = (ulint) t; ++ } ++ } ++ + srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_updated_old = srv_n_rows_updated; + srv_n_rows_deleted_old = srv_n_rows_deleted; +@@ -1833,7 +1888,8 @@ + void + srv_export_innodb_status(void) + { +- ++ long time_elapsed; ++ time_t current_time; + mutex_enter(&srv_innodb_monitor_mutex); + export_vars.innodb_data_pending_reads= os_n_pending_reads; + export_vars.innodb_data_pending_writes= os_n_pending_writes; +@@ -1844,6 +1900,7 @@ + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; ++ export_vars.innodb_dict_size= 
dict_sys->size; + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; +@@ -1854,10 +1911,12 @@ + export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); +- export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); ++ /* This function uses too much CPU for large buffer caches. */ ++ export_vars.innodb_buffer_pool_pages_latched= 1; /* buf_get_latched_pages_number(); */ + export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size - + UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); ++ + export_vars.innodb_page_size= UNIV_PAGE_SIZE; + export_vars.innodb_log_waits= srv_log_waits; + export_vars.innodb_os_log_written= srv_os_log_written; +@@ -1885,6 +1944,103 @@ + export_vars.innodb_rows_inserted= srv_n_rows_inserted; + export_vars.innodb_rows_updated= srv_n_rows_updated; + export_vars.innodb_rows_deleted= srv_n_rows_deleted; ++ export_vars.innodb_long_lock_wait = srv_long_lock_wait; ++ export_vars.innodb_long_lock_waits = srv_long_lock_waits; ++ ++ export_vars.innodb_os_aio_read_requests = os_aio_read_requests; ++ export_vars.innodb_os_aio_write_requests = os_aio_write_requests; ++ ++ export_vars.innodb_os_aio_pages_read = os_aio_pages_read; ++ export_vars.innodb_os_aio_pages_written = os_aio_pages_written; ++ ++ export_vars.innodb_os_aio_read_time = os_aio_read_time; ++ export_vars.innodb_os_aio_write_time = os_aio_write_time; ++ ++ if (os_aio_read_requests > 0 ) { ++ export_vars.innodb_os_aio_read_time_avg ++ = os_aio_read_time / os_aio_read_requests; ++ } else { ++ export_vars.innodb_os_aio_read_time_avg = 0; ++ } ++ if 
(os_aio_write_requests > 0 ) { ++ export_vars.innodb_os_aio_write_time_avg ++ = os_aio_write_time / os_aio_write_requests; ++ } else { ++ export_vars.innodb_os_aio_write_time_avg = 0; ++ } ++ ++ export_vars.innodb_deadlocks = innodb_deadlocks; ++ ++ // simulate srv_printf_innodb_monitor, invoked by innodb_show_status ++ // 0. direct printout inno_lock_wait_timeouts, declared in srv0srv.c ++ // total # of variable(s) updated: 1 ++ export_vars.inno_lock_wait_timeouts = inno_lock_wait_timeouts; ++ ++ // *_print functions are allowed to be called once every ++ // some seconds to prevent too frequent invocation. ++ // the number is innobase_min_status_update_time_interval ++ current_time = time(NULL); ++ time_elapsed = difftime(current_time, srv_last_innodb_status_time); ++ if (time_elapsed >= innobase_min_status_update_time_interval) { ++ os_aio_print(NULL); ++ ibuf_print(NULL); ++ buf_print_io(NULL); ++ lock_print_info_summary(NULL); ++ lock_print_info_all_transactions(NULL); ++ ++ srv_last_innodb_status_time = current_time; ++ } ++ ++ // 1. os_aio_print ++ // the following were filled by calling os_aio_print ++ // total # of variable(s) updated: 8 ++ ++ export_vars.inno_pending_normal_aio_reads = ++ inno_pending_normal_aio_reads; ++ export_vars.inno_pending_normal_aio_writes = ++ inno_pending_normal_aio_writes; ++ export_vars.inno_pending_ibuf_aio_reads = inno_pending_ibuf_aio_reads; ++ export_vars.inno_pending_log_ios = inno_pending_log_ios; ++ export_vars.inno_pending_sync_ios = inno_pending_sync_ios; ++ export_vars.inno_os_reads = os_n_file_reads; ++ export_vars.inno_os_writes = os_n_file_writes; ++ export_vars.inno_os_fsyncs = os_n_fsyncs; ++ ++ // 2. ibuf_print() ++ // total # of variable(s) updated: 4 ++ ++ export_vars.inno_ibuf_size = inno_ibuf_size; ++ export_vars.inno_ibuf_inserts = inno_ibuf_inserts; ++ export_vars.inno_ibuf_merged_recs = inno_ibuf_merged_recs; ++ export_vars.inno_ibuf_merges = inno_ibuf_merges; ++ ++ // 3. 
log_print ++ // total # of variable(s) updated: 1 ++ export_vars.inno_log_ios_done = (ulong) log_sys->n_log_ios; ++ ++ // 5. lock_print_info_summary ++ // it enters the mutexes ++ // 1) innobase_mysql_prepare_print_arbitrary_thd() ++ // 2) lock_mutex_enter_kernel() ++ // total # of variable(s) updated: 3 ++ ++ export_vars.inno_transaction_count = inno_transaction_count; ++ export_vars.inno_transaction_purge_count = ++ inno_transaction_purge_count; ++ export_vars.inno_transaction_purge_lag = inno_transaction_purge_lag; ++ ++ // 6. lock_print_info_all_transactions(NULL) ++ // it exits two mutexes entered from lock_print_info_summary(NULL) ++ // total # of variable(s) updated: 6 ++ ++ export_vars.inno_num_active_transactions = inno_num_active_transactions; ++ export_vars.inno_summed_transaction_age = inno_summed_transaction_age; ++ export_vars.inno_longest_transaction_age = inno_longest_transaction_age; ++ ++ export_vars.inno_num_lock_waiters = inno_num_lock_waiters; ++ export_vars.inno_summed_lock_wait_time = inno_summed_lock_wait_time; ++ export_vars.inno_longest_lock_wait = inno_longest_lock_wait; ++ + mutex_exit(&srv_innodb_monitor_mutex); + + } +@@ -2026,6 +2182,7 @@ + if (thr_get_trx(slot->thr)->wait_lock) { + lock_cancel_waiting_and_release( + thr_get_trx(slot->thr)->wait_lock); ++ ++inno_lock_wait_timeouts; + } + } + } +diff -r b059d02ec814 patch_info/innodb_extra_status.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_extra_status.info Mon Nov 03 05:09:34 2008 -0800 +@@ -0,0 +1,9 @@ ++File=innodb_extra_status.patch ++Name=Adds additional information of InnoDB counters into SHOW STATUS ++Version=1.0 ++Author=Google ++License=GPL ++Comment= ++ChangeLog= ++2008-11-03 ++VT: Initial porting +diff -r b059d02ec814 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -299,12 +299,36 @@ + (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, + 
{"dblwr_writes", + (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, ++ {"dict_size", ++ (char*) &export_vars.innodb_dict_size, SHOW_LONG}, + {"log_waits", + (char*) &export_vars.innodb_log_waits, SHOW_LONG}, + {"log_write_requests", + (char*) &export_vars.innodb_log_write_requests, SHOW_LONG}, + {"log_writes", + (char*) &export_vars.innodb_log_writes, SHOW_LONG}, ++ {"long_lock_wait", ++ (char*) &export_vars.innodb_long_lock_wait, SHOW_BOOL}, ++ {"long_lock_waits", ++ (char*) &export_vars.innodb_long_lock_waits, SHOW_LONG}, ++ ++ {"os_read_requests", ++ (char*) &export_vars.innodb_os_aio_read_requests, SHOW_LONG}, ++ {"os_write_requests", ++ (char*) &export_vars.innodb_os_aio_write_requests, SHOW_LONG}, ++ {"os_pages_read", ++ (char*) &export_vars.innodb_os_aio_pages_read, SHOW_LONG}, ++ {"os_pages_written", ++ (char*) &export_vars.innodb_os_aio_pages_written, SHOW_LONG}, ++ {"os_read_time", ++ (char*) &export_vars.innodb_os_aio_read_time, SHOW_LONGLONG}, ++ {"os_write_time", ++ (char*) &export_vars.innodb_os_aio_write_time, SHOW_LONGLONG}, ++ {"time_per_read", ++ (char*) &export_vars.innodb_os_aio_read_time_avg, SHOW_LONGLONG}, ++ {"time_per_write", ++ (char*) &export_vars.innodb_os_aio_write_time_avg, SHOW_LONGLONG}, ++ + {"os_log_fsyncs", + (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG}, + {"os_log_pending_fsyncs", +@@ -339,6 +363,56 @@ + (char*) &export_vars.innodb_rows_read, SHOW_LONG}, + {"rows_updated", + (char*) &export_vars.innodb_rows_updated, SHOW_LONG}, ++ {"deadlocks", ++ (char*) &export_vars.innodb_deadlocks, SHOW_LONG}, ++ ++ /* 24 innodb status variables exported to status */ ++ {"transaction_count", ++ (char*) &export_vars.inno_transaction_count, SHOW_LONG}, ++ {"transaction_purge_count", ++ (char*) &export_vars.inno_transaction_purge_count, SHOW_LONG}, ++ {"transaction_purge_lag", ++ (char*) &export_vars.inno_transaction_purge_lag, SHOW_LONG}, ++ {"active_transactions", ++ (char*) &export_vars.inno_num_active_transactions, SHOW_LONG}, 
++ {"summed_transaction_age", ++ (char*) &export_vars.inno_summed_transaction_age, SHOW_LONG}, ++ {"longest_transaction_age", ++ (char*) &export_vars.inno_longest_transaction_age, SHOW_LONG}, ++ {"lock_wait_timeouts", ++ (char*) &export_vars.inno_lock_wait_timeouts, SHOW_LONG}, ++ {"lock_waiters", ++ (char*) &export_vars.inno_num_lock_waiters, SHOW_LONG}, ++ {"summed_lock_wait_time", ++ (char*) &export_vars.inno_summed_lock_wait_time, SHOW_LONG}, ++ {"longest_lock_wait", ++ (char*) &export_vars.inno_longest_lock_wait, SHOW_LONG}, ++ {"pending_normal_aio_reads", ++ (char*) &export_vars.inno_pending_normal_aio_reads, SHOW_LONG}, ++ {"pending_normal_aio_writes", ++ (char*) &export_vars.inno_pending_normal_aio_writes, SHOW_LONG}, ++ {"pending_ibuf_aio_reads", ++ (char*) &export_vars.inno_pending_ibuf_aio_reads, SHOW_LONG}, ++ {"pending_log_ios", ++ (char*) &export_vars.inno_pending_log_ios, SHOW_LONG}, ++ {"pending_sync_ios", ++ (char*) &export_vars.inno_pending_sync_ios, SHOW_LONG}, ++ {"os_reads", ++ (char*) &export_vars.inno_os_reads, SHOW_LONG}, ++ {"os_writes", ++ (char*) &export_vars.inno_os_writes, SHOW_LONG}, ++ {"os_fsyncs", ++ (char*) &export_vars.inno_os_fsyncs, SHOW_LONG}, ++ {"ibuf_inserts", /* each ibuf_* name maps to the same-named inno_ibuf_* counter */ ++ (char*) &export_vars.inno_ibuf_inserts, SHOW_LONG}, ++ {"ibuf_size", ++ (char*) &export_vars.inno_ibuf_size, SHOW_LONG}, ++ {"ibuf_merged_recs", ++ (char*) &export_vars.inno_ibuf_merged_recs, SHOW_LONG}, ++ {"ibuf_merges", ++ (char*) &export_vars.inno_ibuf_merges, SHOW_LONG}, ++ {"log_ios_done", ++ (char*) &export_vars.inno_log_ios_done, SHOW_LONG}, + {NullS, NullS, SHOW_LONG}}; + + /* General functions */ +diff -r b059d02ec814 sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/ha_innodb.h Mon Nov 03 05:09:34 2008 -0800 +@@ -198,6 +198,7 @@ + extern struct show_var_st innodb_status_variables[]; + extern ulong innobase_fast_shutdown; + extern long innobase_max_merged_io; ++extern long innobase_min_status_update_time_interval; + extern 
ulong innobase_large_page_size; + extern long innobase_mirrored_log_groups, innobase_log_files_in_group; + extern longlong innobase_buffer_pool_size, innobase_log_file_size; +diff -r b059d02ec814 sql/mysqld.cc +--- a/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/mysqld.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -4950,6 +4950,7 @@ + OPT_INNODB_SYNC_SPIN_LOOPS, + OPT_INNODB_CONCURRENCY_TICKETS, + OPT_INNODB_THREAD_SLEEP_DELAY, ++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL, + OPT_BDB_CACHE_SIZE, + OPT_BDB_LOG_BUFFER_SIZE, + OPT_BDB_MAX_LOCK, +@@ -6031,6 +6032,14 @@ + (gptr*) &srv_thread_sleep_delay, + (gptr*) &srv_thread_sleep_delay, + 0, GET_ULONG, REQUIRED_ARG, 10000L, 0L, ULONG_MAX, 0, 1L, 0}, ++ {"innodb_status_update_interval", ++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL, ++ "Minimum time interval in seconds before InnoDB status counters " ++ "are updated during SHOW STATUS. " ++ "InnoDB counters are always updated during SHOW INNODB STATUS.", ++ (gptr*) &innobase_min_status_update_time_interval, ++ (gptr*) &innobase_min_status_update_time_interval, ++ 0, GET_LONG, REQUIRED_ARG, 30, 0, 3600, 0, 1, 0}, + #endif /* HAVE_INNOBASE_DB */ + {"interactive_timeout", OPT_INTERACTIVE_TIMEOUT, + "The number of seconds the server waits for activity on an interactive connection before closing it.", +diff -r b059d02ec814 sql/set_var.cc +--- a/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/set_var.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -948,6 +948,8 @@ + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG }, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG }, + {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG}, ++ {"innodb_status_update_interval", ++ (char*) &innobase_min_status_update_time_interval, SHOW_LONG}, + #endif + {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS}, + {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS}, diff --git 
a/percona/5.0.91-b22-20100522/innodb_fsync_source.patch b/percona/5.0.91-b22-20100522/innodb_fsync_source.patch new file mode 100644 index 0000000..2961b78 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_fsync_source.patch @@ -0,0 +1,594 @@ +diff -r ef44d8017b6b innobase/buf/buf0flu.c +--- a/innobase/buf/buf0flu.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/buf/buf0flu.c Fri Jul 03 15:41:32 2009 -0700 +@@ -341,7 +341,7 @@ + + /* Now flush the doublewrite buffer data to disk */ + +- fil_flush(TRX_SYS_SPACE); ++ fil_flush(TRX_SYS_SPACE, FLUSH_FROM_DIRTY_BUFFER); + + /* We know that the writes have been flushed to disk now + and in recovery we will find them in the doublewrite buffer +@@ -381,7 +381,7 @@ + + /* Now we flush the data to disk (for example, with fsync) */ + +- fil_flush_file_spaces(FIL_TABLESPACE); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_DIRTY_BUFFER); + + /* We can now reuse the doublewrite memory buffer: */ + +@@ -501,7 +501,8 @@ + } + #else + /* Force the log to the disk before writing the modified block */ +- log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE); ++ log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE, ++ LOG_WRITE_FROM_DIRTY_BUFFER); + #endif + buf_flush_init_for_writing(block->frame, block->newest_modification, + block->space, block->offset); +diff -r ef44d8017b6b innobase/fil/fil0fil.c +--- a/innobase/fil/fil0fil.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/fil/fil0fil.c Fri Jul 03 15:41:32 2009 -0700 +@@ -245,6 +245,7 @@ + request */ + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + /* list of all file spaces */ ++ ulint flush_types[FLUSH_FROM_NUMBER];/* calls to fil_flush by caller */ + }; + + /* The tablespace memory cache. 
This variable is NULL before the module is +@@ -849,7 +850,7 @@ + /* Flush tablespaces so that we can close modified files in the LRU + list */ + +- fil_flush_file_spaces(FIL_TABLESPACE); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER); + + count++; + +@@ -1309,7 +1310,10 @@ + + UT_LIST_INIT(system->unflushed_spaces); + UT_LIST_INIT(system->space_list); +- ++ { ++ int x; ++ for (x = 0; x < FLUSH_FROM_NUMBER; ++x) system->flush_types[x] = 0; ++ } + return(system); + } + +@@ -1437,6 +1441,23 @@ + } + + mutex_exit(&(system->mutex)); ++} ++ ++/******************************************************************** ++Prints internal counters */ ++ ++void ++fil_print(FILE *file) ++{ ++ fprintf(file, ++ "fsync callers: %lu buffer pool, %lu other, %lu checkpoint, " ++ "%lu log aio, %lu log sync, %lu archive\n", ++ fil_system->flush_types[FLUSH_FROM_DIRTY_BUFFER], ++ fil_system->flush_types[FLUSH_FROM_OTHER], ++ fil_system->flush_types[FLUSH_FROM_CHECKPOINT], ++ fil_system->flush_types[FLUSH_FROM_LOG_IO_COMPLETE], ++ fil_system->flush_types[FLUSH_FROM_LOG_WRITE_UP_TO], ++ fil_system->flush_types[FLUSH_FROM_ARCHIVE]); + } + + /******************************************************************** +@@ -2256,7 +2277,7 @@ + + os_thread_sleep(20000); + +- fil_flush(id); ++ fil_flush(id, FLUSH_FROM_OTHER); + + goto retry; + +@@ -3574,7 +3595,7 @@ + size_after_extend, *actual_size); */ + mutex_exit(&(system->mutex)); + +- fil_flush(space_id); ++ fil_flush(space_id, FLUSH_FROM_OTHER); + + return(success); + } +@@ -4167,8 +4188,9 @@ + void + fil_flush( + /*======*/ +- ulint space_id) /* in: file space id (this can be a group of ++ ulint space_id, /* in: file space id (this can be a group of + log files or a tablespace of the database) */ ++ flush_from_type flush_type)/* in: identifies the caller */ + { + fil_system_t* system = fil_system; + fil_space_t* space; +@@ -4177,7 +4199,7 @@ + ib_longlong old_mod_counter; + + mutex_enter(&(system->mutex)); +- ++ 
system->flush_types[flush_type]++; + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (!space || space->is_being_deleted) { +@@ -4282,7 +4304,8 @@ + void + fil_flush_file_spaces( + /*==================*/ +- ulint purpose) /* in: FIL_TABLESPACE, FIL_LOG */ ++ ulint purpose, /* in: FIL_TABLESPACE, FIL_LOG */ ++ flush_from_type flush_type)/* in: identifies the caller */ + { + fil_system_t* system = fil_system; + fil_space_t* space; +@@ -4323,7 +4346,7 @@ + a non-existing space id. */ + for (i = 0; i < n_space_ids; i++) { + +- fil_flush(space_ids[i]); ++ fil_flush(space_ids[i], flush_type); + } + + mem_free(space_ids); +diff -r ef44d8017b6b innobase/include/fil0fil.h +--- a/innobase/include/fil0fil.h Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/include/fil0fil.h Fri Jul 03 15:41:32 2009 -0700 +@@ -197,6 +197,13 @@ + fil_init( + /*=====*/ + ulint max_n_open); /* in: max number of open files */ ++/******************************************************************** ++ * Prints internal counters. */ ++ ++void ++fil_print( ++ /*=====*/ ++ FILE* file); /* in: output stream */ + /*********************************************************************** + Opens all log files and system tablespace data files. They stay open until the + database server shutdown. This should be called at a server startup after the +@@ -625,14 +632,26 @@ + ulint segment); /* in: the number of the segment in the aio + array to wait for */ + /************************************************************************** ++Identifies the caller of fil_flush. */ ++typedef enum { ++ FLUSH_FROM_DIRTY_BUFFER, ++ FLUSH_FROM_OTHER, ++ FLUSH_FROM_CHECKPOINT, ++ FLUSH_FROM_LOG_IO_COMPLETE, ++ FLUSH_FROM_LOG_WRITE_UP_TO, ++ FLUSH_FROM_ARCHIVE, ++ FLUSH_FROM_NUMBER ++} flush_from_type; ++/************************************************************************** + Flushes to disk possible writes cached by the OS. 
If the space does not exist + or is being dropped, does not do anything. */ + + void + fil_flush( + /*======*/ +- ulint space_id); /* in: file space id (this can be a group of ++ ulint space_id, /* in: file space id (this can be a group of + log files or a tablespace of the database) */ ++ flush_from_type flush_type);/* in: identifies the caller */ + /************************************************************************** + Flushes to disk writes in file spaces of the given type possibly cached by + the OS. */ +@@ -640,7 +659,8 @@ + void + fil_flush_file_spaces( + /*==================*/ +- ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */ ++ ulint purpose, /* in: FIL_TABLESPACE, FIL_LOG */ ++ flush_from_type flush_type);/* in: identifies the caller */ + /********************************************************************** + Checks the consistency of the tablespace cache. */ + +diff -r ef44d8017b6b innobase/include/log0log.h +--- a/innobase/include/log0log.h Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/include/log0log.h Fri Jul 03 15:41:32 2009 -0700 +@@ -146,6 +146,22 @@ + log_io_complete( + /*============*/ + log_group_t* group); /* in: log group */ ++ ++/********************************************************** ++Describes the caller of log_write_up_to. */ ++ ++typedef enum { ++ LOG_WRITE_FROM_DIRTY_BUFFER, ++ LOG_WRITE_FROM_BACKGROUND_SYNC, ++ LOG_WRITE_FROM_BACKGROUND_ASYNC, ++ LOG_WRITE_FROM_INTERNAL, ++ LOG_WRITE_FROM_CHECKPOINT_SYNC, ++ LOG_WRITE_FROM_CHECKPOINT_ASYNC, ++ LOG_WRITE_FROM_LOG_ARCHIVE, ++ LOG_WRITE_FROM_COMMIT_SYNC, ++ LOG_WRITE_FROM_COMMIT_ASYNC, ++ LOG_WRITE_FROM_NUMBER ++} log_sync_type; + /********************************************************** + This function is called, e.g., when a transaction wants to commit. 
It checks + that the log has been written to the log file up to the last log entry written +@@ -159,14 +175,21 @@ + be written, ut_dulint_max if not specified */ + ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + or LOG_WAIT_ALL_GROUPS */ +- ibool flush_to_disk); +- /* in: TRUE if we want the written log also to be +- flushed to disk */ ++ ibool flush_to_disk, ++ /* in: TRUE if we want the written log also to be flushed to disk */ ++ log_sync_type caller);/* in: identifies the caller */ + /******************************************************************** + Does a syncronous flush of the log buffer to disk. */ + + void + log_buffer_flush_to_disk(void); ++/*==========================*/ ++/******************************************************************** ++Flushes the log buffer. Forces it to disk depending on the value of ++the configuration parameter innodb_flush_log_at_trx_commit. */ ++ ++void ++log_buffer_flush_maybe_sync(void); + /*==========================*/ + /******************************************************************** + Flushes the log buffer. Forces it to disk depending on the value of +@@ -751,6 +774,12 @@ + AND flushed to disk */ + ulint n_pending_writes;/* number of currently pending flushes + or writes */ ++ ulint log_sync_callers[LOG_WRITE_FROM_NUMBER]; ++ /* counts calls to log_write_up_to */ ++ ulint log_sync_syncers[LOG_WRITE_FROM_NUMBER]; ++ /* counts calls to log_write_up_to when log file is sync'd */ ++ ulint n_syncs; /* number of fsyncs done for log file */ ++ ulint n_checkpoints; /* number of calls to log_checkpoint */ + /* NOTE on the 'flush' in names of the fields below: starting from + 4.0.14, we separate the write of the log file and the actual fsync() + or other method to flush it to disk. 
The names below shhould really +diff -r ef44d8017b6b innobase/log/log0log.c +--- a/innobase/log/log0log.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/log/log0log.c Fri Jul 03 15:41:32 2009 -0700 +@@ -782,6 +782,15 @@ + log_sys->written_to_all_lsn = log_sys->lsn; + + log_sys->n_pending_writes = 0; ++ { ++ int x; ++ for (x = 0; x < LOG_WRITE_FROM_NUMBER; ++x) { ++ log_sys->log_sync_callers[x] = 0; ++ log_sys->log_sync_syncers[x] = 0; ++ } ++ } ++ log_sys->n_syncs = 0; ++ log_sys->n_checkpoints = 0; + + log_sys->no_flush_event = os_event_create(NULL); + +@@ -1066,7 +1075,7 @@ + if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { + +- fil_flush(group->space_id); ++ fil_flush(group->space_id, FLUSH_FROM_LOG_IO_COMPLETE); + } + + #ifdef UNIV_DEBUG +@@ -1088,7 +1097,7 @@ + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && srv_flush_log_at_trx_commit != 2) { + +- fil_flush(group->space_id); ++ fil_flush(group->space_id, FLUSH_FROM_LOG_IO_COMPLETE); + } + + mutex_enter(&(log_sys->mutex)); +@@ -1303,9 +1312,10 @@ + be written, ut_dulint_max if not specified */ + ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP, + or LOG_WAIT_ALL_GROUPS */ +- ibool flush_to_disk) ++ ibool flush_to_disk, + /* in: TRUE if we want the written log also to be + flushed to disk */ ++ log_sync_type caller) /* in: identifies caller */ + { + log_group_t* group; + ulint start_offset; +@@ -1315,6 +1325,7 @@ + ulint loop_count; + ulint unlock; + ++ log_sys->log_sync_callers[caller]++; + if (recv_no_ibuf_operations) { + /* Recovery is running and no operations on the log files are + allowed yet (the variable name .._no_ibuf_.. 
is misleading) */ +@@ -1465,13 +1476,17 @@ + so we have also flushed to disk what we have written */ + + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; ++ log_sys->n_syncs++; ++ log_sys->log_sync_syncers[caller]++; + + } else if (flush_to_disk) { + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + +- fil_flush(group->space_id); ++ fil_flush(group->space_id, FLUSH_FROM_LOG_WRITE_UP_TO); + log_sys->flushed_to_disk_lsn = log_sys->write_lsn; ++ log_sys->n_syncs++; ++ log_sys->log_sync_syncers[caller]++; + } + + mutex_enter(&(log_sys->mutex)); +@@ -1520,7 +1535,8 @@ + + mutex_exit(&(log_sys->mutex)); + +- log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE); ++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE, ++ LOG_WRITE_FROM_BACKGROUND_SYNC); + } + + /******************************************************************** +@@ -1574,7 +1590,7 @@ + mutex_exit(&(log->mutex)); + + if (do_flush) { +- log_write_up_to(lsn, LOG_NO_WAIT, FALSE); ++ log_write_up_to(lsn, LOG_NO_WAIT, FALSE, LOG_WRITE_FROM_INTERNAL); + } + } + +@@ -1944,11 +1960,11 @@ + } + + if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) { +- fil_flush_file_spaces(FIL_TABLESPACE); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_CHECKPOINT); + } + + mutex_enter(&(log_sys->mutex)); +- ++ log_sys->n_checkpoints++; + oldest_lsn = log_buf_pool_get_oldest_modification(); + + mutex_exit(&(log_sys->mutex)); +@@ -1961,7 +1977,8 @@ + write-ahead-logging algorithm ensures that the log has been flushed + up to oldest_lsn. 
*/ + +- log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE); ++ log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE, ++ LOG_WRITE_FROM_CHECKPOINT_SYNC); + + mutex_enter(&(log_sys->mutex)); + +@@ -2589,7 +2606,7 @@ + + mutex_exit(&(log_sys->mutex)); + +- fil_flush(group->archive_space_id); ++ fil_flush(group->archive_space_id, FLUSH_FROM_ARCHIVE); + + mutex_enter(&(log_sys->mutex)); + +@@ -2670,7 +2687,8 @@ + + mutex_exit(&(log_sys->mutex)); + +- log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE); ++ log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE, ++ LOG_WRITE_FROM_LOG_ARCHIVE); + + calc_new_limit = FALSE; + +@@ -3207,8 +3225,8 @@ + } + mutex_exit(&kernel_mutex); + +- fil_flush_file_spaces(FIL_TABLESPACE); +- fil_flush_file_spaces(FIL_LOG); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER); ++ fil_flush_file_spaces(FIL_LOG, FLUSH_FROM_OTHER); + + /* The call fil_write_flushed_lsn_to_data_files() will pass the buffer + pool: therefore it is essential that the buffer pool has been +@@ -3241,7 +3259,7 @@ + + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); + +- fil_flush_file_spaces(FIL_TABLESPACE); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER); + + fil_close_all_files(); + +@@ -3363,15 +3381,45 @@ + time_elapsed = 0.001 + difftime(current_time, + log_sys->last_printout_time); + fprintf(file, +- "%lu pending log writes, %lu pending chkp writes\n" +- "%lu log i/o's done, %.2f log i/o's/second\n", +- (ulong) log_sys->n_pending_writes, +- (ulong) log_sys->n_pending_checkpoint_writes, +- (ulong) log_sys->n_log_ios, +- ((log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed)); ++ "%lu pending log writes, %lu pending chkp writes\n" ++ "%lu log i/o's done, %.2f log i/o's/second, %lu syncs, %lu checkpoints\n", ++ (ulong) log_sys->n_pending_writes, ++ (ulong) log_sys->n_pending_checkpoint_writes, ++ (ulong) log_sys->n_log_ios, ++ (log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed, ++ log_sys->n_syncs, ++ 
log_sys->n_checkpoints); + + log_sys->n_log_ios_old = log_sys->n_log_ios; + log_sys->last_printout_time = current_time; ++ ++ fprintf(file, ++ "log sync callers: %lu buffer pool, background %lu sync and %lu async, " ++ "%lu internal, checkpoint %lu sync and %lu async, %lu archive, " ++ "commit %lu sync and %lu async\n", ++ log_sys->log_sync_callers[LOG_WRITE_FROM_DIRTY_BUFFER], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_BACKGROUND_SYNC], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_BACKGROUND_ASYNC], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_INTERNAL], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_CHECKPOINT_SYNC], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_CHECKPOINT_ASYNC], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_LOG_ARCHIVE], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_COMMIT_SYNC], ++ log_sys->log_sync_callers[LOG_WRITE_FROM_COMMIT_ASYNC]); ++ ++ fprintf(file, ++ "log sync syncers: %lu buffer pool, background %lu sync and %lu async, " ++ "%lu internal, checkpoint %lu sync and %lu async, %lu archive, " ++ "commit %lu sync and %lu async\n", ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_DIRTY_BUFFER], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_BACKGROUND_SYNC], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_BACKGROUND_ASYNC], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_INTERNAL], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_CHECKPOINT_SYNC], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_CHECKPOINT_ASYNC], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_LOG_ARCHIVE], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_COMMIT_SYNC], ++ log_sys->log_sync_syncers[LOG_WRITE_FROM_COMMIT_ASYNC]); + + mutex_exit(&(log_sys->mutex)); + } +diff -r ef44d8017b6b innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/srv/srv0srv.c Fri Jul 03 15:41:32 2009 -0700 +@@ -1698,6 +1698,12 @@ + (ulong)time_elapsed); + + fputs("----------\n" ++ "BACKGROUND THREAD\n" ++ "----------\n", file); ++ fil_print(file); ++ ++ ++ fputs("----------\n" + 
"SEMAPHORES\n" + "----------\n", file); + sync_print(file); +diff -r ef44d8017b6b innobase/trx/trx0sys.c +--- a/innobase/trx/trx0sys.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/trx/trx0sys.c Fri Jul 03 15:41:32 2009 -0700 +@@ -511,7 +511,7 @@ + page += UNIV_PAGE_SIZE; + } + +- fil_flush_file_spaces(FIL_TABLESPACE); ++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER); + + leave_func: + ut_free(unaligned_read_buf); +diff -r ef44d8017b6b innobase/trx/trx0trx.c +--- a/innobase/trx/trx0trx.c Fri Jul 03 15:41:25 2009 -0700 ++++ b/innobase/trx/trx0trx.c Fri Jul 03 15:41:32 2009 -0700 +@@ -942,19 +942,21 @@ + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, +- FALSE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + /* Write the log to the log files AND flush + them to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE, ++ LOG_WRITE_FROM_COMMIT_SYNC); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + ut_error; + } +@@ -1701,18 +1703,21 @@ + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + /* Write the log to the log files AND flush them to + disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE, ++ LOG_WRITE_FROM_COMMIT_SYNC); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); ++ 
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + ut_error; + } +@@ -1948,19 +1953,21 @@ + if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) { + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, +- FALSE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + /* Write the log to the log files AND flush + them to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE, ++ LOG_WRITE_FROM_COMMIT_SYNC); + } + } else if (srv_flush_log_at_trx_commit == 2) { + + /* Write the log but do not flush it to disk */ + +- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE); ++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE, ++ LOG_WRITE_FROM_COMMIT_ASYNC); + } else { + ut_error; + } +diff -r ef44d8017b6b patch_info/innodb_fsync_source.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_fsync_source.info Fri Jul 03 15:41:32 2009 -0700 +@@ -0,0 +1,9 @@ ++File=innodb_fsync_source.patch ++Name=Information of fsync callers in InnoDB ++Version=1.0 ++Author=Google ++License=GPL ++Comment= ++ChangeLog= ++2008-11-01 ++VT: Initial porting diff --git a/percona/5.0.91-b22-20100522/innodb_io_patches.patch b/percona/5.0.91-b22-20100522/innodb_io_patches.patch new file mode 100644 index 0000000..aaef29a --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_io_patches.patch @@ -0,0 +1,1379 @@ +diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c +--- a/innobase/buf/buf0flu.c 2009-05-08 06:12:03.000000000 +0900 ++++ b/innobase/buf/buf0flu.c 2009-07-02 16:44:49.000000000 +0900 +@@ -898,10 +898,17 @@ + + old_page_count = page_count; + ++ if (srv_flush_neighbor_pages) { + /* Try to flush also all the neighbors */ + page_count += + buf_flush_try_neighbors(space, offset, + flush_type); ++ } else { ++ /* Try to flush the page only */ ++ page_count += ++ buf_flush_try_page(space, offset, ++ 
flush_type); ++ } + /* fprintf(stderr, + "Flush type %lu, page no %lu, neighb %lu\n", + flush_type, offset, +diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c +--- a/innobase/buf/buf0rea.c 2009-07-02 16:43:23.000000000 +0900 ++++ b/innobase/buf/buf0rea.c 2009-07-02 16:44:49.000000000 +0900 +@@ -20,6 +20,7 @@ + #include "os0file.h" + #include "srv0start.h" + ++extern uint srv_read_ahead; + extern ulint srv_read_ahead_rnd; + extern ulint srv_read_ahead_seq; + extern ulint srv_buf_pool_reads; +@@ -189,6 +190,10 @@ + ulint err; + ulint i; + ++ if (!(srv_read_ahead & 1)) { ++ return(0); ++ } ++ + if (srv_startup_is_before_trx_rollback_phase) { + /* No read-ahead to avoid thread deadlocks */ + return(0); +@@ -396,6 +401,10 @@ + ulint err; + ulint i; + ++ if (!(srv_read_ahead & 2)) { ++ return(0); ++ } ++ + if (srv_startup_is_before_trx_rollback_phase) { + /* No read-ahead to avoid thread deadlocks */ + return(0); +diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c +--- a/innobase/ibuf/ibuf0ibuf.c 2009-05-08 06:12:04.000000000 +0900 ++++ b/innobase/ibuf/ibuf0ibuf.c 2009-07-02 16:44:49.000000000 +0900 +@@ -370,8 +370,9 @@ + grow in size, as the references on the upper levels of the tree can + change */ + +- ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE +- / IBUF_POOL_SIZE_PER_MAX_SIZE; ++ ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE ++ / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE); ++ srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE; + ibuf->meter = IBUF_THRESHOLD + 1; + + UT_LIST_INIT(ibuf->data_list); +@@ -2258,11 +2259,13 @@ + + mutex_enter(&ibuf_mutex); + ++ if (!srv_ibuf_active_contract) { + if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) { + mutex_exit(&ibuf_mutex); + + return; + } ++ } + + sync = FALSE; + +diff -ruN a/innobase/include/log0log.h b/innobase/include/log0log.h +--- a/innobase/include/log0log.h 2009-05-08 06:12:06.000000000 +0900 
++++ b/innobase/include/log0log.h 2009-07-02 16:44:49.000000000 +0900 +@@ -169,6 +169,13 @@ + log_buffer_flush_to_disk(void); + /*==========================*/ + /******************************************************************** ++Flushes the log buffer. Forces it to disk depending on the value of ++the configuration parameter innodb_flush_log_at_trx_commit. */ ++ ++void ++log_buffer_flush_maybe_sync(void); ++/*=============================*/ ++/******************************************************************** + Advances the smallest lsn for which there are unflushed dirty blocks in the + buffer pool and also may make a new checkpoint. NOTE: this function may only + be called if the calling thread owns no synchronization objects! */ +diff -ruN a/innobase/include/os0file.h b/innobase/include/os0file.h +--- a/innobase/include/os0file.h 2009-07-02 16:43:23.000000000 +0900 ++++ b/innobase/include/os0file.h 2009-07-02 16:44:49.000000000 +0900 +@@ -551,8 +551,10 @@ + /*========*/ + ulint n, /* in: maximum number of pending aio operations + allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++// ulint n_segments, /* in: combined number of segments in the four ++// first aio arrays; must be >= 4 */ ++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads */ ++ ulint n_write_threads, /**/ + ulint n_slots_sync); /* in: number of slots in the sync aio array */ + /*********************************************************************** + Requests an asynchronous i/o operation. 
*/ +diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h 2009-07-02 16:43:23.000000000 +0900 ++++ b/innobase/include/srv0srv.h 2009-07-02 18:02:38.000000000 +0900 +@@ -89,6 +89,8 @@ + extern ulint srv_lock_table_size; + + extern ulint srv_n_file_io_threads; ++extern ulint srv_n_read_io_threads; ++extern ulint srv_n_write_io_threads; + + #ifdef UNIV_LOG_ARCHIVE + extern ibool srv_log_archive_on; +@@ -133,6 +135,15 @@ + extern ulong srv_max_purge_lag; + extern ibool srv_use_awe; + extern ibool srv_use_adaptive_hash_indexes; ++ ++extern ulint srv_io_capacity; ++extern long long srv_ibuf_max_size; ++extern ulint srv_ibuf_active_contract; ++extern ulint srv_ibuf_accel_rate; ++extern ulint srv_flush_neighbor_pages; ++extern ulint srv_enable_unsafe_group_commit; ++extern uint srv_read_ahead; ++extern uint srv_adaptive_checkpoint; + /*-------------------------------------------*/ + + extern ulint srv_n_rows_inserted; +diff -ruN a/innobase/log/log0log.c b/innobase/log/log0log.c +--- a/innobase/log/log0log.c 2009-05-08 06:12:10.000000000 +0900 ++++ b/innobase/log/log0log.c 2009-07-02 16:44:49.000000000 +0900 +@@ -1524,6 +1524,29 @@ + } + + /******************************************************************** ++Flush the log buffer. Force it to disk depending on the value of ++innodb_flush_log_at_trx_commit. */ ++ ++void ++log_buffer_flush_maybe_sync(void) ++/*=============================*/ ++{ ++ dulint lsn; ++ ++ mutex_enter(&(log_sys->mutex)); ++ ++ lsn = log_sys->lsn; ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */ ++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, ++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE, ++ srv_flush_log_at_trx_commit == 1 ? 
++ LOG_WRITE_FROM_BACKGROUND_SYNC : ++ LOG_WRITE_FROM_BACKGROUND_ASYNC); ++} ++/******************************************************************** + Tries to establish a big enough margin of free space in the log buffer, such + that a new log entry can be catenated without an immediate need for a flush. */ + static +@@ -3326,6 +3349,15 @@ + (ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn), + (ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn)); + ++ fprintf(file, ++ "Max checkpoint age %lu\n" ++ "Modified age %lu\n" ++ "Checkpoint age %lu\n", ++ (ulong) log_sys->max_checkpoint_age, ++ (ulong) ut_dulint_minus(log_sys->lsn, ++ log_buf_pool_get_oldest_modification()), ++ (ulong) ut_dulint_minus(log_sys->lsn, log_sys->last_checkpoint_lsn)); ++ + current_time = time(NULL); + + time_elapsed = 0.001 + difftime(current_time, +diff -ruN a/innobase/os/os0file.c b/innobase/os/os0file.c +--- a/innobase/os/os0file.c 2009-07-02 16:43:23.000000000 +0900 ++++ b/innobase/os/os0file.c 2009-07-02 16:44:49.000000000 +0900 +@@ -66,6 +66,28 @@ + + ibool os_aio_print_debug = FALSE; + ++/* State for the state of an IO request in simulated AIO. ++ Protocol for simulated aio: ++ client requests IO: find slot with reserved = FALSE. Add entry with ++ status = OS_AIO_NOT_ISSUED. ++ IO thread wakes: find adjacent slots with reserved = TRUE and status = ++ OS_AIO_NOT_ISSUED. Change status for slots to ++ OS_AIO_ISSUED. ++ IO operation completes: set status for slots to OS_AIO_DONE. set status ++ for the first slot to OS_AIO_CLAIMED and return ++ result for that slot. ++ When there are multiple read and write threads, they all compete to execute ++ the requests in the array (os_aio_array_t). This avoids the need to load ++ balance requests at the time the request is made at the cost of waking all ++ threads when a request is available. ++*/ ++typedef enum { ++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */ ++ OS_AIO_ISSUED, /* Being processed by an IO thread. 
*/ ++ OS_AIO_DONE, /* Request processed. */ ++ OS_AIO_CLAIMED /* Result being returned to client. */ ++} os_aio_status; ++ + /* The aio array slot structure */ + typedef struct os_aio_slot_struct os_aio_slot_t; + +@@ -74,6 +96,8 @@ + ulint pos; /* index of the slot in the aio + array */ + ibool reserved; /* TRUE if this slot is reserved */ ++ os_aio_status status; /* Status for current request. Valid when reserved ++ is TRUE. Used only in simulated aio. */ + time_t reservation_time;/* time when reserved */ + ulint len; /* length of the block to read or + write */ +@@ -84,11 +108,11 @@ + ulint offset_high; /* 32 high bits of file offset */ + os_file_t file; /* file where to read or write */ + const char* name; /* file name or path */ +- ibool io_already_done;/* used only in simulated aio: +- TRUE if the physical i/o already +- made and only the slot message +- needs to be passed to the caller +- of os_aio_simulated_handle */ ++// ibool io_already_done;/* used only in simulated aio: ++// TRUE if the physical i/o already ++// made and only the slot message ++// needs to be passed to the caller ++// of os_aio_simulated_handle */ + fil_node_t* message1; /* message which is given by the */ + void* message2; /* the requester of an aio operation + and which can be used to identify +@@ -137,6 +161,13 @@ + /* Array of events used in simulated aio */ + os_event_t* os_aio_segment_wait_events = NULL; + ++/* Number for the first global segment for reading. */ ++const ulint os_aio_first_read_segment = 2; ++ ++/* Number for the first global segment for writing. Set to ++2 + os_aio_read_write_threads. */ ++ulint os_aio_first_write_segment = 0; ++ + /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These + are NULL when the module has not yet been initialized. 
*/ + static os_aio_array_t* os_aio_read_array = NULL; +@@ -145,11 +176,17 @@ + static os_aio_array_t* os_aio_log_array = NULL; + static os_aio_array_t* os_aio_sync_array = NULL; + ++/* Per thread buffer used for merged IO requests. Used by ++os_aio_simulated_handle so that a buffer doesn't have to be allocated ++for each request. */ ++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS]; ++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS]; ++ + static ulint os_aio_n_segments = ULINT_UNDEFINED; + + /* If the following is TRUE, read i/o handler threads try to + wait until a batch of new read requests have been posted */ +-static ibool os_aio_recommend_sleep_for_read_threads = FALSE; ++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE; + + ulint os_n_file_reads = 0; + ulint os_bytes_read_since_printout = 0; +@@ -2878,8 +2915,10 @@ + /*========*/ + ulint n, /* in: maximum number of pending aio operations + allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++// ulint n_segments, /* in: combined number of segments in the four ++// first aio arrays; must be >= 4 */ ++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads*/ ++ ulint n_write_threads, /**/ + ulint n_slots_sync) /* in: number of slots in the sync aio array */ + { + ulint n_read_segs; +@@ -2889,6 +2928,8 @@ + #ifdef POSIX_ASYNC_IO + sigset_t sigset; + #endif ++ ulint n_segments = 2 + n_read_threads + n_write_threads; ++ + ut_ad(n % n_segments == 0); + ut_ad(n_segments >= 4); + +@@ -2896,14 +2937,17 @@ + + for (i = 0; i < n_segments; i++) { + srv_set_io_thread_op_info(i, "not started yet"); ++ os_aio_thread_buffer[i] = 0; ++ os_aio_thread_buffer_size[i] = 0; + } + + n_per_seg = n / n_segments; +- n_write_segs = (n_segments - 2) / 2; +- n_read_segs = n_segments - 2 - n_write_segs; ++ n_write_segs = n_write_threads; ++ n_read_segs = n_read_threads; + + /* 
fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ + ++ os_aio_first_write_segment = os_aio_first_read_segment + n_read_threads; + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[0] = "insert buffer thread"; +@@ -2912,14 +2956,14 @@ + + srv_io_thread_function[1] = "log thread"; + +- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, ++ os_aio_read_array = os_aio_array_create(n_per_seg, + n_read_segs); + for (i = 2; i < 2 + n_read_segs; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + +- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, ++ os_aio_write_array = os_aio_array_create(n_per_seg, + n_write_segs); + for (i = 2 + n_read_segs; i < n_segments; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); +@@ -3181,6 +3225,13 @@ + struct aiocb* control; + #endif + ulint i; ++ ulint prim_segment; ++ ulint n; ++ ++ n = array->n_slots / array->n_segments; ++ /* 64 blocks' striping ( aligning max(BUF_READ_AHEAD_AREA) ) */ ++ prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments); ++ + loop: + os_mutex_enter(array->mutex); + +@@ -3199,6 +3250,16 @@ + goto loop; + } + ++ for (i = prim_segment * n; i < array->n_slots; i++) { ++ slot = os_aio_array_get_nth_slot(array, i); ++ ++ if (slot->reserved == FALSE) { ++ break; ++ } ++ } ++ ++ if (slot->reserved == TRUE){ ++ /* Not found after the intended segment. So we should search before. 
*/ + for (i = 0;; i++) { + slot = os_aio_array_get_nth_slot(array, i); + +@@ -3206,6 +3267,7 @@ + break; + } + } ++ } + + array->n_reserved++; + +@@ -3228,7 +3290,8 @@ + slot->buf = buf; + slot->offset = offset; + slot->offset_high = offset_high; +- slot->io_already_done = FALSE; ++// slot->io_already_done = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + #ifdef WIN_ASYNC_IO + control = &(slot->control); +@@ -3281,6 +3344,7 @@ + ut_ad(slot->reserved); + + slot->reserved = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + array->n_reserved--; + +@@ -3317,16 +3381,18 @@ + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +- n = array->n_slots / array->n_segments; ++ n = array->n_slots; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot = os_aio_array_get_nth_slot(array, i); + +- if (slot->reserved) { ++ if (slot->reserved && ++ (slot->status == OS_AIO_NOT_ISSUED || ++ slot->status == OS_AIO_DONE)) { + /* Found an i/o request */ + + break; +@@ -3336,7 +3402,25 @@ + os_mutex_exit(array->mutex); + + if (i < n) { +- os_event_set(os_aio_segment_wait_events[global_segment]); ++ if (array == os_aio_ibuf_array) { ++ os_event_set(os_aio_segment_wait_events[0]); ++ ++ } else if (array == os_aio_log_array) { ++ os_event_set(os_aio_segment_wait_events[1]); ++ ++ } else if (array == os_aio_read_array) { ++ ulint x; ++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else if (array == os_aio_write_array) { ++ ulint x; ++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else { ++ ut_a(0); ++ } + } + } + +@@ -3347,8 +3431,6 @@ + os_aio_simulated_wake_handler_threads(void) + /*=======================================*/ + { +- ulint i; +- + if (os_aio_use_native_aio) { + /* 
We do not use simulated aio: do nothing */ + +@@ -3357,9 +3439,10 @@ + + os_aio_recommend_sleep_for_read_threads = FALSE; + +- for (i = 0; i < os_aio_n_segments; i++) { +- os_aio_simulated_wake_handler_thread(i); +- } ++ os_aio_simulated_wake_handler_thread(0); ++ os_aio_simulated_wake_handler_thread(1); ++ os_aio_simulated_wake_handler_thread(os_aio_first_read_segment); ++ os_aio_simulated_wake_handler_thread(os_aio_first_write_segment); + } + + /************************************************************************** +@@ -3640,7 +3723,7 @@ + ut_ad(os_aio_validate()); + ut_ad(segment < array->n_segments); + +- n = array->n_slots / array->n_segments; ++ n = array->n_slots; + + if (array == os_aio_sync_array) { + os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); +@@ -3648,12 +3731,12 @@ + } else { + srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); + i = os_event_wait_multiple(n, +- (array->native_events) + segment * n); ++ (array->native_events)); + } + + os_mutex_enter(array->mutex); + +- slot = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot = os_aio_array_get_nth_slot(array, i); + + ut_a(slot->reserved); + +@@ -3830,10 +3913,13 @@ + os_aio_slot_t* slot; + os_aio_slot_t* slot2; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; ++ os_aio_slot_t* lowest_request; ++ os_aio_slot_t* oldest_request; + ulint n_consecutive; + ulint total_len; + ulint offs; + ulint lowest_offset; ++ ulint oldest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; +@@ -3841,6 +3927,7 @@ + ibool ret; + ulint n; + ulint i; ++ time_t now; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +@@ -3853,7 +3940,7 @@ + ut_ad(os_aio_validate()); + ut_ad(segment < array->n_segments); + +- n = array->n_slots / array->n_segments; ++ n = array->n_slots; + + /* Look through n slots after the segment * n'th slot */ + +@@ -3875,9 +3962,9 @@ + done */ + + for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + 
segment * n); ++ slot = os_aio_array_get_nth_slot(array, i); + +- if (slot->reserved && slot->io_already_done) { ++ if (slot->reserved && slot->status == OS_AIO_DONE) { + + if (os_aio_print_debug) { + fprintf(stderr, +@@ -3897,67 +3984,57 @@ + then pick the one at the lowest offset. */ + + biggest_age = 0; +- lowest_offset = ULINT_MAX; ++ now = time(NULL); ++ oldest_request = lowest_request = NULL; ++ oldest_offset = lowest_offset = ULINT_MAX; + ++ /* Find the oldest request and the request with the smallest offset */ + for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot = os_aio_array_get_nth_slot(array, i); + +- if (slot->reserved) { +- age = (ulint)difftime(time(NULL), +- slot->reservation_time); ++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) { ++ age = (ulint)difftime(now, slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age +- && slot->offset < lowest_offset)) { ++ && slot->offset < oldest_offset)) { + + /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- + biggest_age = age; +- lowest_offset = slot->offset; ++ oldest_request = slot; ++ oldest_offset = slot->offset; + } +- } +- } +- +- if (n_consecutive == 0) { +- /* There were no old requests. 
Look for an i/o request at the +- lowest offset in the array (we ignore the high 32 bits of the +- offset in these heuristics) */ +- +- lowest_offset = ULINT_MAX; +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, +- i + segment * n); +- +- if (slot->reserved && slot->offset < lowest_offset) { + ++ /* Look for an i/o request at the lowest offset in the array ++ * (we ignore the high 32 bits of the offset) */ ++ if (slot->offset < lowest_offset) { + /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- ++ lowest_request = slot; + lowest_offset = slot->offset; + } + } + } + +- if (n_consecutive == 0) { ++ if (!lowest_request && !oldest_request) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + +- slot = consecutive_ios[0]; ++ if (oldest_request) { ++ slot = oldest_request; ++ } else { ++ slot = lowest_request; ++ } ++ consecutive_ios[0] = slot; ++ n_consecutive = 1; + + /* Check if there are several consecutive blocks to read or write */ + + consecutive_loop: + for (i = 0; i < n; i++) { +- slot2 = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot2 = os_aio_array_get_nth_slot(array, i); + + if (slot2->reserved && slot2 != slot + && slot2->offset == slot->offset + slot->len +@@ -3965,7 +4042,8 @@ + sum does not wrap over */ + && slot2->offset_high == slot->offset_high + && slot2->type == slot->type +- && slot2->file == slot->file) { ++ && slot2->file == slot->file ++ && slot2->status == OS_AIO_NOT_ISSUED) { + + /* Found a consecutive i/o request */ + +@@ -3994,6 +4072,8 @@ + + for (i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; ++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_ISSUED; + } + + if (n_consecutive == 1) { +@@ -4001,7 +4081,14 @@ + combined_buf = slot->buf; + combined_buf2 = NULL; + } else { +- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ if ((total_len + UNIV_PAGE_SIZE) > 
os_aio_thread_buffer_size[global_segment]) { ++ if (os_aio_thread_buffer[global_segment]) ++ ut_free(os_aio_thread_buffer[global_segment]); ++ ++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE; ++ } ++ combined_buf2 = os_aio_thread_buffer[global_segment]; + + ut_a(combined_buf2); + +@@ -4012,6 +4099,9 @@ + this assumes that there is just one i/o-handler thread serving + a single segment of slots! */ + ++ ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_ISSUED); ++ + os_mutex_exit(array->mutex); + + if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { +@@ -4081,16 +4171,13 @@ + } + } + +- if (combined_buf2) { +- ut_free(combined_buf2); +- } +- + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (i = 0; i < n_consecutive; i++) { +- consecutive_ios[i]->io_already_done = TRUE; ++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_DONE; + } + + /* We return the messages for the first slot now, and if there were +@@ -4100,6 +4187,8 @@ + slot_io_done: + + ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_DONE); ++ slot->status = OS_AIO_CLAIMED; + + *message1 = slot->message1; + *message2 = slot->message2; +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-07-02 16:43:23.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-07-02 18:36:54.000000000 +0900 +@@ -167,6 +167,8 @@ + ulint srv_lock_table_size = ULINT_MAX; + + ulint srv_n_file_io_threads = ULINT_MAX; ++ulint srv_n_read_io_threads = 1; ++ulint srv_n_write_io_threads = 1; + + #ifdef UNIV_LOG_ARCHIVE + ibool srv_log_archive_on = FALSE; +@@ -330,6 +332,24 @@ + ibool srv_use_awe = FALSE; + ibool srv_use_adaptive_hash_indexes = TRUE; + ++ulint srv_io_capacity = 100; ++ ++/* Returns the number of IO operations that is X percent of the capacity. 
++PCT_IO(5) -> returns the number of IO operations that is 5% of the max ++where max is srv_io_capacity. */ ++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0))) ++ ++long long srv_ibuf_max_size = 0; ++ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */ ++ulint srv_ibuf_accel_rate = 100; ++#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0))) ++ ++ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */ ++ ++ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */ ++ ++uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ ++uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */ + /*-------------------------------------------*/ + ulong srv_n_spin_wait_rounds = 20; + ulong srv_n_free_tickets_to_enter = 500; +@@ -2228,6 +2248,10 @@ + ulint n_pend_ios; + ibool skip_sleep = FALSE; + ulint i; ++ ++ dulint lsn_old; ++ ++ dulint oldest_lsn; + + #ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Master thread starts, id %lu\n", +@@ -2244,6 +2268,9 @@ + + mutex_exit(&kernel_mutex); + ++ mutex_enter(&(log_sys->mutex)); ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); + os_event_set(srv_sys->operational); + loop: + /*****************************************************************/ +@@ -2279,6 +2306,18 @@ + if (!skip_sleep) { + + os_thread_sleep(1000000); ++ /* ++ mutex_enter(&(log_sys->mutex)); ++ oldest_lsn = buf_pool_get_oldest_modification(); ++ dulint lsn = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); ++ ++ if (!ut_dulint_is_zero(oldest_lsn)) ++ fprintf(stderr, ++ "InnoDB flush: age pct: %lu, lsn progress: %lu\n", ++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age, ++ ut_dulint_minus(lsn, lsn_old)); ++ */ + } + + skip_sleep = FALSE; +@@ -2317,13 +2356,14 @@ + + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) { ++ 
if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) { + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5)); + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); + } + + if (buf_get_modified_ratio_pct() > +@@ -2332,7 +2372,7 @@ + /* Try to keep the number of modified pages in the + buffer pool under the limit wished by the user */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + ut_dulint_max); + + /* If we had to do the flush, it may have taken +@@ -2341,6 +2381,140 @@ + iteration of this loop. */ + + skip_sleep = TRUE; ++ mutex_enter(&(log_sys->mutex)); ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); ++ } else if (srv_adaptive_checkpoint == 1) { ++ ++ /* Try to keep modified age not to exceed ++ max_checkpoint_age * 7/8 line */ ++ ++ mutex_enter(&(log_sys->mutex)); ++ lsn_old = log_sys->lsn; ++ oldest_lsn = buf_pool_get_oldest_modification(); ++ if (ut_dulint_is_zero(oldest_lsn)) { ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ } else { ++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn) ++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) { ++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */ ++ /* We should not flush from here. 
*/ ++ mutex_exit(&(log_sys->mutex)); ++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn) ++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) { ++ ++ /* 2nd defence line (max_checkpoint_age * 3/4) */ ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ++ ut_dulint_max); ++ skip_sleep = TRUE; ++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn) ++ > (log_sys->max_checkpoint_age)/2 ) { ++ ++ /* 1st defence line (max_checkpoint_age * 1/2) */ ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10), ++ ut_dulint_max); ++ skip_sleep = TRUE; ++ } else { ++ mutex_exit(&(log_sys->mutex)); ++ } ++ } ++ } else if (srv_adaptive_checkpoint == 2) { ++ ++ /* Try to keep modified age not to exceed ++ max_checkpoint_age * 7/8 line */ ++ ++ mutex_enter(&(log_sys->mutex)); ++ ++ oldest_lsn = buf_pool_get_oldest_modification(); ++ if (ut_dulint_is_zero(oldest_lsn)) { ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); ++ ++ } else { ++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn) ++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) { ++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */ ++ /* We should not flush from here. 
*/ ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); ++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn) ++ > (log_sys->max_checkpoint_age)/2 ) { ++ ++ /* defence line (max_checkpoint_age * 1/2) */ ++ dulint lsn = log_sys->lsn; ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ ib_longlong level, bpl; ++ buf_block_t* bpage; ++ ++ mutex_enter(&buf_pool->mutex); ++ ++ level = 0; ++ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); ++ ++ while (bpage != NULL) { ++ dulint oldest_modification = bpage->oldest_modification; ++ if (!ut_dulint_is_zero(oldest_modification)) { ++ level += log_sys->max_checkpoint_age ++ - ut_dulint_minus(lsn, oldest_modification); ++ } ++ bpage = UT_LIST_GET_NEXT(flush_list, bpage); ++ } ++ ++ if (level) { ++ bpl = ((ib_longlong) UT_LIST_GET_LEN(buf_pool->flush_list) ++ * UT_LIST_GET_LEN(buf_pool->flush_list) ++ * ut_dulint_minus(lsn, lsn_old)) / level; ++ } else { ++ bpl = 0; ++ } ++ ++ mutex_exit(&buf_pool->mutex); ++ ++ if (!srv_use_doublewrite_buf) { ++ /* flush is faster than when doublewrite */ ++ bpl = (bpl * 3) / 4; ++ } ++ ++ if(bpl) { ++retry_flush_batch: ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, ++ bpl, ++ ut_dulint_add(oldest_lsn, ++ ut_dulint_minus(lsn, ++ lsn_old))); ++ if (n_pages_flushed == ULINT_UNDEFINED) { ++ os_thread_sleep(5000); ++ goto retry_flush_batch; ++ } ++ } ++ ++ lsn_old = lsn; ++ /* ++ fprintf(stderr, ++ "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n", ++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age, ++ ut_dulint_minus(lsn, lsn_old), bpl); ++ */ ++ } else { ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); ++ } ++ } ++ ++ } else { ++ mutex_enter(&(log_sys->mutex)); ++ lsn_old = log_sys->lsn; ++ mutex_exit(&(log_sys->mutex)); + } + + if (srv_activity_count == old_activity_count) { +@@ -2367,23 +2541,25 @@ + n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + 
buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) { ++ if (n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) { + + srv_main_thread_op_info = "flushing buffer pool pages"; +- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); + } + + /* We run a batch of insert buffer merge every 10 seconds, + even if the server were active */ + + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5)); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); + + /* We run a full purge every 10 seconds, even if the server + were active */ +@@ -2422,14 +2598,14 @@ + (> 70 %), we assume we can afford reserving the disk(s) for + the time it requires to flush 100 pages */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + ut_dulint_max); + } else { + /* Otherwise, we only flush a small number of pages so that + we do not unnecessarily use much disk i/o capacity from + other work */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10), + ut_dulint_max); + } + +@@ -2518,7 +2694,7 @@ + if (srv_fast_shutdown && srv_shutdown_state > 0) { + n_bytes_merged = 0; + } else { +- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20); ++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(100)); + } + + srv_main_thread_op_info = "reserving kernel mutex"; +@@ -2535,7 +2711,7 @@ + + if (srv_fast_shutdown < 2) { + n_pages_flushed = +- buf_flush_batch(BUF_FLUSH_LIST, 100, 
ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + } else { + /* In the fastest shutdown we do not flush the buffer pool + to data files: we set n_pages_flushed to 0 artificially. */ +@@ -2557,7 +2733,14 @@ + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ current_time = time(NULL); ++ if (difftime(current_time, last_flush_time) > 1) { ++ log_buffer_flush_to_disk(); ++ last_flush_time = current_time; ++ } else { ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ } + + srv_main_thread_op_info = "making checkpoint"; + +diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c 2009-05-08 06:12:12.000000000 +0900 ++++ b/innobase/srv/srv0start.c 2009-07-02 16:44:49.000000000 +0900 +@@ -1205,24 +1205,28 @@ + return(DB_ERROR); + } + ++ /* over write innodb_file_io_threads */ ++ srv_n_file_io_threads = 2 + srv_n_read_io_threads + srv_n_write_io_threads; ++ + /* Restrict the maximum number of file i/o threads */ + if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) { + + srv_n_file_io_threads = SRV_MAX_N_IO_THREADS; ++ srv_n_read_io_threads = srv_n_write_io_threads = (SRV_MAX_N_IO_THREADS - 2) / 2; + } + + if (!os_aio_use_native_aio) { + /* In simulated aio we currently have use only for 4 threads */ +- srv_n_file_io_threads = 4; ++ /*srv_n_file_io_threads = 4;*/ + + os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD + * srv_n_file_io_threads, +- srv_n_file_io_threads, ++ srv_n_read_io_threads, srv_n_write_io_threads, + SRV_MAX_N_PENDING_SYNC_IOS); + } else { + os_aio_init(SRV_N_PENDING_IOS_PER_THREAD + * srv_n_file_io_threads, +- srv_n_file_io_threads, ++ srv_n_read_io_threads, srv_n_write_io_threads, + SRV_MAX_N_PENDING_SYNC_IOS); + } + +diff -ruN a/patch_info/innodb_io_patches.info b/patch_info/innodb_io_patches.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_io_patches.info 2009-07-02 16:44:49.000000000 
+0900 +@@ -0,0 +1,11 @@ ++File=innodb_io_patches.patch ++Name=Cluster of past InnoDB IO patches ++Version=1.1 ++Author=Percona ++License=GPL ++Comment=This patch contains fixed (control_flush_and_merge_and_read, control_io-threads, adaptive_flush) ++ChangeLog= ++2008-11-06 ++YK: Initial release ++2009-01-09 ++YK: Some parameters are added +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-07-02 16:43:23.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-07-02 16:44:49.000000000 +0900 +@@ -149,6 +149,7 @@ + innobase_lock_wait_timeout, innobase_force_recovery, + innobase_open_files; + ++long innobase_read_io_threads, innobase_write_io_threads; + longlong innobase_buffer_pool_size, innobase_log_file_size; + + /* The default values for the following char* start-up parameters +@@ -1417,6 +1418,8 @@ + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + + srv_n_file_io_threads = (ulint) innobase_file_io_threads; ++ srv_n_read_io_threads = (ulint) innobase_read_io_threads; ++ srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout; + srv_force_recovery = (ulint) innobase_force_recovery; +@@ -7330,6 +7333,10 @@ + trx_t* trx = check_trx_exists(thd); + + if (thd->lex->sql_command != SQLCOM_XA_PREPARE) { ++ if (srv_enable_unsafe_group_commit && !thd->variables.innodb_support_xa) { ++ /* choose group commit rather than binlog order */ ++ return(0); ++ } + + /* For ibbackup to work the order of transactions in binlog + and InnoDB must be the same. 
Consider the situation +diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h +--- a/sql/ha_innodb.h 2009-07-02 16:43:23.000000000 +0900 ++++ b/sql/ha_innodb.h 2009-07-02 18:10:51.000000000 +0900 +@@ -204,6 +204,7 @@ + extern long innobase_additional_mem_pool_size; + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; ++extern long innobase_read_io_threads, innobase_write_io_threads; + extern long innobase_force_recovery; + extern long innobase_open_files; + extern char *innobase_data_home_dir, *innobase_data_file_path; +@@ -234,6 +235,15 @@ + extern ulong srv_thread_concurrency; + extern ulong srv_commit_concurrency; + extern ulong srv_flush_log_at_trx_commit; ++extern ulong srv_io_capacity; ++extern long long srv_ibuf_max_size; ++extern ulong srv_ibuf_active_contract; ++extern ulong srv_ibuf_accel_rate; ++extern ulong srv_flush_neighbor_pages; ++extern ulong srv_enable_unsafe_group_commit; ++extern uint srv_read_ahead; ++extern uint srv_adaptive_checkpoint; ++ + /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does + NOT update cardinality for indexes of InnoDB table". 
By default we are + running with the fix disabled because MySQL 5.1 is frozen for such +diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc +--- a/sql/mysqld.cc 2009-07-02 16:43:23.000000000 +0900 ++++ b/sql/mysqld.cc 2009-07-02 18:00:04.000000000 +0900 +@@ -5086,6 +5086,16 @@ + OPT_INNODB_ROLLBACK_ON_TIMEOUT, + OPT_SECURE_FILE_PRIV, + OPT_KEEP_FILES_ON_CREATE, ++ OPT_INNODB_IO_CAPACITY, ++ OPT_INNODB_IBUF_MAX_SIZE, ++ OPT_INNODB_IBUF_ACTIVE_CONTRACT, ++ OPT_INNODB_IBUF_ACCEL_RATE, ++ OPT_INNODB_FLUSH_NEIGHBOR_PAGES, ++ OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT, ++ OPT_INNODB_READ_AHEAD, ++ OPT_INNODB_ADAPTIVE_CHECKPOINT, ++ OPT_INNODB_READ_IO_THREADS, ++ OPT_INNODB_WRITE_IO_THREADS, + OPT_INNODB_ADAPTIVE_HASH_INDEX, + OPT_FEDERATED, + OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM +@@ -5403,6 +5413,44 @@ + (gptr*) &srv_use_legacy_cardinality_algorithm, + (gptr*) &srv_use_legacy_cardinality_algorithm, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, ++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY, ++ "Number of IO operations per second the server can do. Tunes background IO rate.", ++ (gptr*) &srv_io_capacity, (gptr*) &srv_io_capacity, ++ 0, GET_ULONG, REQUIRED_ARG, 200, 100, 999999999, 0, 0, 0}, ++ {"innodb_ibuf_max_size", OPT_INNODB_IBUF_MAX_SIZE, ++ "The maximum size of the insert buffer. (in bytes)", ++ (gptr*) &srv_ibuf_max_size, (gptr*) &srv_ibuf_max_size, 0, ++ GET_LL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0}, ++ {"innodb_ibuf_active_contract", OPT_INNODB_IBUF_ACTIVE_CONTRACT, ++ "Enable/Disable active_contract of insert buffer. 0:disable 1:enable", ++ (gptr*) &srv_ibuf_active_contract, (gptr*) &srv_ibuf_active_contract, ++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0}, ++ {"innodb_ibuf_accel_rate", OPT_INNODB_IBUF_ACCEL_RATE, ++ "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. 
(in percentage)", ++ (gptr*) &srv_ibuf_accel_rate, (gptr*) &srv_ibuf_accel_rate, ++ 0, GET_ULONG, REQUIRED_ARG, 100, 100, 999999999, 0, 0, 0}, ++ {"innodb_flush_neighbor_pages", OPT_INNODB_FLUSH_NEIGHBOR_PAGES, ++ "Enable/Disable flushing also neighbor pages. 0:disable 1:enable", ++ (gptr*) &srv_flush_neighbor_pages, (gptr*) &srv_flush_neighbor_pages, ++ 0, GET_ULONG, REQUIRED_ARG, 1, 0, 1, 0, 0, 0}, ++ {"innodb_read_ahead", OPT_INNODB_READ_AHEAD, ++ "Control read ahead activity. (none, random, linear, [both])", ++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, ++ {"innodb_adaptive_checkpoint", OPT_INNODB_ADAPTIVE_CHECKPOINT, ++ "Enable/Diasable flushing along modified age. ([none], reflex, estimate)", ++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, ++ {"innodb_enable_unsafe_group_commit", OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT, ++ "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.", ++ (gptr*) &srv_enable_unsafe_group_commit, (gptr*) &srv_enable_unsafe_group_commit, ++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0}, ++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS, ++ "Number of background read I/O threads in InnoDB.", ++ (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads, ++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS, ++ "Number of background write I/O threads in InnoDB.", ++ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, ++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, + #endif /* End HAVE_INNOBASE_DB */ + {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.", + (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0, +@@ -7644,6 +7692,38 @@ + case OPT_INNODB_LOG_ARCHIVE: + innobase_log_archive= argument ? test(atoi(argument)) : 1; + break; ++ case OPT_INNODB_READ_AHEAD: ++ if (argument == disabled_my_option) ++ srv_read_ahead = 0; ++ else if (! 
argument) ++ srv_read_ahead = 3; ++ else ++ { ++ int type; ++ if ((type=find_type(argument, &innodb_read_ahead_typelib, 2)) <= 0) ++ { ++ fprintf(stderr,"Unknown innodb_read_ahead type: %s\n",argument); ++ exit(1); ++ } ++ srv_read_ahead = (uint) ((type - 1) & 3); ++ } ++ break; ++ case OPT_INNODB_ADAPTIVE_CHECKPOINT: ++ if (argument == disabled_my_option) ++ srv_adaptive_checkpoint = 0; ++ else if (! argument) ++ srv_adaptive_checkpoint = 0; ++ else ++ { ++ int type; ++ if ((type=find_type(argument, &innodb_adaptive_checkpoint_typelib, 2)) <= 0) ++ { ++ fprintf(stderr,"Unknown innodb_adaptive_checkpoint type: %s\n",argument); ++ exit(1); ++ } ++ srv_adaptive_checkpoint = (uint) ((type - 1) % 3); ++ } ++ break; + #endif /* HAVE_INNOBASE_DB */ + case OPT_MYISAM_RECOVER: + { +diff -ruN a/sql/set_var.cc b/sql/set_var.cc +--- a/sql/set_var.cc 2009-07-02 16:43:23.000000000 +0900 ++++ b/sql/set_var.cc 2009-07-02 17:45:29.000000000 +0900 +@@ -489,6 +489,57 @@ + sys_var_long_ptr sys_innodb_flush_log_at_trx_commit( + "innodb_flush_log_at_trx_commit", + &srv_flush_log_at_trx_commit); ++sys_var_long_ptr sys_innodb_io_capacity("innodb_io_capacity", ++ &srv_io_capacity); ++sys_var_long_ptr sys_innodb_ibuf_active_contract("innodb_ibuf_active_contract", ++ &srv_ibuf_active_contract); ++sys_var_long_ptr sys_innodb_ibuf_accel_rate("innodb_ibuf_accel_rate", ++ &srv_ibuf_accel_rate); ++sys_var_long_ptr sys_innodb_flush_neighbor_pages("innodb_flush_neighbor_pages", ++ &srv_flush_neighbor_pages); ++ ++const char *innodb_read_ahead_names[]= ++{ ++ "none", /* 0 */ ++ "random", ++ "linear", ++ "both", /* 3 */ ++ /* For compatibility of the older patch */ ++ "0", /* 4 ("none" + 4) */ ++ "1", ++ "2", ++ "3", /* 7 ("both" + 4) */ ++ NullS ++}; ++TYPELIB innodb_read_ahead_typelib= ++{ ++ array_elements(innodb_read_ahead_names) - 1, "innodb_read_ahead_typelib", ++ innodb_read_ahead_names, NULL ++}; ++sys_var_enum sys_innodb_read_ahead("innodb_read_ahead", &srv_read_ahead, ++ 
&innodb_read_ahead_typelib, fix_innodb_read_ahead); ++sys_var_long_ptr sys_innodb_enable_unsafe_group_commit("innodb_enable_unsafe_group_commit", ++ &srv_enable_unsafe_group_commit); ++ ++const char *innodb_adaptive_checkpoint_names[]= ++{ ++ "none", /* 0 */ ++ "reflex", /* 1 */ ++ "estimate", /* 2 */ ++ /* For compatibility of the older patch */ ++ "0", /* 3 ("none" + 3) */ ++ "1", /* 4 ("reflex" + 3) */ ++ "2", /* 5 ("estimate" + 3) */ ++ NullS ++}; ++TYPELIB innodb_adaptive_checkpoint_typelib= ++{ ++ array_elements(innodb_adaptive_checkpoint_names) - 1, "innodb_adaptive_checkpoint_typelib", ++ innodb_adaptive_checkpoint_names, NULL ++}; ++sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint", ++ &srv_adaptive_checkpoint, ++ &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint); + sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path", + &innobase_data_file_path); + sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir", +@@ -860,6 +911,13 @@ + &sys_innodb_thread_concurrency, + &sys_innodb_commit_concurrency, + &sys_innodb_flush_log_at_trx_commit, ++ &sys_innodb_io_capacity, ++ &sys_innodb_ibuf_active_contract, ++ &sys_innodb_ibuf_accel_rate, ++ &sys_innodb_flush_neighbor_pages, ++ &sys_innodb_read_ahead, ++ &sys_innodb_enable_unsafe_group_commit, ++ &sys_innodb_adaptive_checkpoint, + #endif + &sys_trust_routine_creators, + &sys_trust_function_creators, +@@ -997,6 +1055,16 @@ + {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS}, + {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS}, + {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS}, ++ {sys_innodb_io_capacity.name, (char*) &sys_innodb_io_capacity, SHOW_SYS}, ++ {"innodb_ibuf_max_size", (char*) &srv_ibuf_max_size, SHOW_LONGLONG}, ++ {sys_innodb_ibuf_active_contract.name, (char*) &sys_innodb_ibuf_active_contract, SHOW_SYS}, ++ 
{sys_innodb_ibuf_accel_rate.name, (char*) &sys_innodb_ibuf_accel_rate, SHOW_SYS}, ++ {sys_innodb_flush_neighbor_pages.name, (char*) &sys_innodb_flush_neighbor_pages, SHOW_SYS}, ++ {sys_innodb_read_ahead.name, (char*) &sys_innodb_read_ahead, SHOW_SYS}, ++ {sys_innodb_enable_unsafe_group_commit.name, (char*) &sys_innodb_enable_unsafe_group_commit, SHOW_SYS}, ++ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, ++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, ++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, + {sys_innodb_use_legacy_cardinality_algorithm.name, + (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS}, + #endif +@@ -1459,6 +1527,18 @@ + } + } + ++#ifdef HAVE_INNOBASE_DB ++extern void fix_innodb_read_ahead(THD *thd, enum_var_type type) ++{ ++ srv_read_ahead &= 3; ++} ++ ++extern void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type) ++{ ++ srv_adaptive_checkpoint %= 3; ++} ++#endif /* HAVE_INNOBASE_DB */ ++ + static void fix_max_binlog_size(THD *thd, enum_var_type type) + { + DBUG_ENTER("fix_max_binlog_size"); +diff -ruN a/sql/set_var.h b/sql/set_var.h +--- a/sql/set_var.h 2009-07-02 16:43:23.000000000 +0900 ++++ b/sql/set_var.h 2009-07-02 17:35:17.000000000 +0900 +@@ -31,6 +31,11 @@ + + extern TYPELIB bool_typelib, delay_key_write_typelib, sql_mode_typelib; + ++#ifdef HAVE_INNOBASE_DB ++extern TYPELIB innodb_read_ahead_typelib; ++extern TYPELIB innodb_adaptive_checkpoint_typelib; ++#endif /* HAVE_INNOBASE_DB */ ++ + typedef int (*sys_check_func)(THD *, set_var *); + typedef bool (*sys_update_func)(THD *, set_var *); + typedef void (*sys_after_update_func)(THD *,enum_var_type); +@@ -1148,6 +1153,10 @@ + int sql_set_variables(THD *thd, List<set_var_base> *var_list); + bool not_all_support_one_shot(List<set_var_base> *var_list); + void fix_delay_key_write(THD *thd, enum_var_type type); ++#ifdef HAVE_INNOBASE_DB ++void 
fix_innodb_read_ahead(THD *thd, enum_var_type type); ++void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type); ++#endif /* HAVE_INNOBASE_DB */ + ulong fix_sql_mode(ulong sql_mode); + extern sys_var_const_str sys_charset_system; + extern sys_var_str sys_init_connect; diff --git a/percona/5.0.91-b22-20100522/innodb_io_pattern.patch b/percona/5.0.91-b22-20100522/innodb_io_pattern.patch new file mode 100644 index 0000000..d9e60e9 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_io_pattern.patch @@ -0,0 +1,693 @@ +diff -r d4826c0a98c2 include/mysql_com.h +--- a/include/mysql_com.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/include/mysql_com.h Wed Jul 29 10:00:12 2009 -0700 +@@ -122,6 +122,9 @@ + #define REFRESH_DES_KEY_FILE 0x40000L + #define REFRESH_USER_RESOURCES 0x80000L + ++/* TRUNCATE INFORMATION_SCHEMA.INNODB_IO_PATTERN */ ++#define REFRESH_INNODB_IO_PATTERN 0x1000000L ++ + #define CLIENT_LONG_PASSWORD 1 /* new more secure passwords */ + #define CLIENT_FOUND_ROWS 2 /* Found instead of affected rows */ + #define CLIENT_LONG_FLAG 4 /* Get all column flags */ +diff -r d4826c0a98c2 innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c Wed Jul 29 09:58:58 2009 -0700 ++++ b/innobase/buf/buf0buf.c Wed Jul 29 10:00:12 2009 -0700 +@@ -654,6 +654,9 @@ + } + + buf_pool->page_hash = hash_create(2 * max_size); ++ buf_pool->io_counter_hash = NULL; ++ buf_pool->io_counter_heap = NULL; ++ buf_pool->io_counters = 0; + + buf_pool->n_pend_reads = 0; + +@@ -1967,6 +1970,9 @@ + ulint io_type; + ulint read_page_no; + ++ buf_io_counter_t* io_counter; ++ ulint fold; ++ + ut_ad(block); + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); +@@ -2068,6 +2074,26 @@ + buf_pool->n_pages_read++; + + rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ); ++ /* io_counter here */ ++ if (srv_io_pattern && srv_io_pattern_trace_running) { ++ fold = buf_page_address_fold(block->space, block->offset); ++ HASH_SEARCH(hash, buf_pool->io_counter_hash, fold, io_counter, ++ (io_counter->space == 
block->space) && (io_counter->offset == block->offset)); ++ if (io_counter == NULL && buf_pool->io_counters < srv_io_pattern_size_limit) { ++ io_counter = mem_heap_alloc(buf_pool->io_counter_heap,(sizeof(buf_io_counter_t))); ++ io_counter->space = block->space; ++ io_counter->offset = block->offset; ++ io_counter->n_read = 0; ++ io_counter->n_write = 0; ++ HASH_INSERT(buf_io_counter_t, hash, buf_pool->io_counter_hash, ++ buf_page_address_fold(block->space, block->offset), io_counter); ++ buf_pool->io_counters++; ++ } ++ if (io_counter != NULL) { ++ io_counter->index_id = ut_dulint_get_low(btr_page_get_index_id(buf_block_get_frame(block))); ++ io_counter->n_read++; ++ } ++ } + + #ifdef UNIV_DEBUG + if (buf_debug_prints) { +@@ -2083,6 +2109,26 @@ + buf_flush_write_complete(block); + + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); ++ /* io_counter here */ ++ if (srv_io_pattern && srv_io_pattern_trace_running) { ++ fold = buf_page_address_fold(block->space, block->offset); ++ HASH_SEARCH(hash, buf_pool->io_counter_hash, fold, io_counter, ++ (io_counter->space == block->space) && (io_counter->offset == block->offset)); ++ if (io_counter == NULL && buf_pool->io_counters < srv_io_pattern_size_limit) { ++ io_counter = mem_heap_alloc(buf_pool->io_counter_heap,(sizeof(buf_io_counter_t))); ++ io_counter->space = block->space; ++ io_counter->offset = block->offset; ++ io_counter->n_read = 0; ++ io_counter->n_write = 0; ++ HASH_INSERT(buf_io_counter_t, hash, buf_pool->io_counter_hash, ++ buf_page_address_fold(block->space, block->offset), io_counter); ++ buf_pool->io_counters++; ++ } ++ if (io_counter != NULL) { ++ io_counter->index_id = ut_dulint_get_low(btr_page_get_index_id(buf_block_get_frame(block))); ++ io_counter->n_write++; ++ } ++ } + + buf_pool->n_pages_written++; + +@@ -2657,3 +2703,58 @@ + return buf_pool_get_nth_block(buf_pool, i); + + } ++ ++/************************************************************************* ++Controls the internal hash table for IO 
pattern tracing ++along innodb_io_pattern_trace value.*/ ++ ++void ++buf_io_counter_control(void) ++/*========================*/ ++{ ++ ulint n; ++ ++ mutex_enter(&(buf_pool->mutex)); ++ if (srv_io_pattern_trace) { ++ if (buf_pool->io_counter_hash == NULL) { ++ /* estimating (buf_pool * 10) */ ++ buf_pool->io_counter_hash = hash_create(20 * buf_pool->max_size); ++ buf_pool->io_counter_heap = mem_heap_create(4096 * 1024); ++ buf_pool->io_counters = 0; ++ ++ srv_io_pattern = TRUE; ++ } ++ } else { ++ if (buf_pool->io_counter_hash != NULL) { ++ srv_io_pattern = FALSE; ++ ++ for (n = 0; n < buf_pool->io_counter_hash->n_cells; n++) { ++ (buf_pool->io_counter_hash->array + n)->node = NULL; ++ } ++ mem_heap_free(buf_pool->io_counter_heap); ++ buf_pool->io_counter_heap = NULL; ++ buf_pool->io_counters = 0; ++ ++ hash_table_free(buf_pool->io_counter_hash); ++ buf_pool->io_counter_hash = NULL; ++ } ++ } ++ mutex_exit(&(buf_pool->mutex)); ++} ++ ++void ++buf_io_counter_clear(void) ++/*======================*/ ++{ ++ ulint n; ++ ++ mutex_enter(&(buf_pool->mutex)); ++ if (buf_pool->io_counter_hash != NULL) { ++ for (n = 0; n < buf_pool->io_counter_hash->n_cells; n++) { ++ (buf_pool->io_counter_hash->array + n)->node = NULL; ++ } ++ mem_heap_empty(buf_pool->io_counter_heap); ++ buf_pool->io_counters = 0; ++ } ++ mutex_exit(&(buf_pool->mutex)); ++} +diff -r d4826c0a98c2 innobase/include/buf0buf.h +--- a/innobase/include/buf0buf.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/innobase/include/buf0buf.h Wed Jul 29 10:00:12 2009 -0700 +@@ -709,6 +709,18 @@ + void buf_pool_dump(void); + buf_block_t* buf_pool_get_nth_block_no_inline(buf_pool_t* pool, ulint i); + ++ ++/************************************************************************* ++Controls the internal hash table for IO pattern tracing ++along innodb_io_pattern_trace value.*/ ++ ++void ++buf_io_counter_control(void); ++/*=========================*/ ++ ++void ++buf_io_counter_clear(void); ++/*=======================*/ + + /* The 
buffer control block structure */ + +@@ -930,6 +942,9 @@ + ulint curr_size; /* current pool size in pages; + currently always the same as + max_size */ ++ hash_table_t* io_counter_hash; ++ mem_heap_t* io_counter_heap; ++ ulint io_counters; + hash_table_t* page_hash; /* hash table of the file pages */ + + ulint n_pend_reads; /* number of pending read operations */ +@@ -1015,6 +1030,15 @@ + locki table, are not in this list */ + }; + ++struct buf_io_counter_struct{ ++ ulint space; ++ ulint offset; ++ buf_io_counter_t* hash; ++ ulint index_id; ++ ulint n_read; ++ ulint n_write; ++}; ++ + /* States of a control block */ + #define BUF_BLOCK_NOT_USED 211 /* is in the free list */ + #define BUF_BLOCK_READY_FOR_USE 212 /* when buf_get_free_block returns +diff -r d4826c0a98c2 innobase/include/buf0types.h +--- a/innobase/include/buf0types.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/innobase/include/buf0types.h Wed Jul 29 10:00:12 2009 -0700 +@@ -12,6 +12,8 @@ + typedef struct buf_block_struct buf_block_t; + typedef struct buf_pool_struct buf_pool_t; + ++typedef struct buf_io_counter_struct buf_io_counter_t; ++ + /* The 'type' used of a buffer frame */ + typedef byte buf_frame_t; + +diff -r d4826c0a98c2 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/innobase/include/srv0srv.h Wed Jul 29 10:00:12 2009 -0700 +@@ -146,6 +146,11 @@ + extern ulint srv_enable_unsafe_group_commit; + extern uint srv_read_ahead; + extern uint srv_adaptive_checkpoint; ++ ++extern volatile ibool srv_io_pattern; ++extern ulong srv_io_pattern_trace; ++extern ulong srv_io_pattern_trace_running; ++extern ulong srv_io_pattern_size_limit; + /*-------------------------------------------*/ + + extern ulint srv_n_rows_inserted; +diff -r d4826c0a98c2 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Wed Jul 29 09:58:58 2009 -0700 ++++ b/innobase/srv/srv0srv.c Wed Jul 29 10:00:12 2009 -0700 +@@ -352,6 +352,11 @@ + + uint srv_read_ahead = 3; /* 1: random 2: linear 
3: Both */ + uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */ ++ ++volatile ibool srv_io_pattern = FALSE; ++ulint srv_io_pattern_trace = 0; ++ulint srv_io_pattern_trace_running = 0; ++ulint srv_io_pattern_size_limit = ULINT_MAX - (1024 * 1024); + /*-------------------------------------------*/ + ulong srv_n_spin_wait_rounds = 20; + ulong srv_n_free_tickets_to_enter = 500; +diff -r d4826c0a98c2 mysql-test/r/information_schema.result +--- a/mysql-test/r/information_schema.result Wed Jul 29 09:58:58 2009 -0700 ++++ b/mysql-test/r/information_schema.result Wed Jul 29 10:00:12 2009 -0700 +@@ -59,6 +59,7 @@ + USER_PRIVILEGES + USER_STATISTICS + VIEWS ++INNODB_IO_PATTERN + columns_priv + db + func +@@ -742,7 +743,7 @@ + CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1; + CREATE VIEW a2 AS SELECT t_CRASHME FROM a1; + count(*) +-108 ++109 + drop view a2, a1; + drop table t_crashme; + select table_schema,table_name, column_name from +@@ -812,12 +813,13 @@ + TABLE_PRIVILEGES TABLE_NAME select + TABLE_STATISTICS TABLE_NAME select + VIEWS TABLE_NAME select ++INNODB_IO_PATTERN TABLE_NAME select + delete from mysql.user where user='mysqltest_4'; + delete from mysql.db where user='mysqltest_4'; + flush privileges; + SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA; + table_schema count(*) +-information_schema 23 ++information_schema 24 + mysql 17 + create table t1 (i int, j int); + create trigger trg1 before insert on t1 for each row +@@ -1225,6 +1227,7 @@ + USER_PRIVILEGES GRANTEE + USER_STATISTICS USER + VIEWS TABLE_SCHEMA ++INNODB_IO_PATTERN SPACE + SELECT t.table_name, c1.column_name + FROM information_schema.tables t + INNER JOIN +@@ -1263,6 +1266,7 @@ + USER_PRIVILEGES GRANTEE + USER_STATISTICS USER + VIEWS TABLE_SCHEMA ++INNODB_IO_PATTERN SPACE + SELECT MAX(table_name) FROM information_schema.tables; + MAX(table_name) + VIEWS +@@ -1337,6 +1341,7 @@ + COLUMN_PRIVILEGES 
information_schema.COLUMN_PRIVILEGES 1 + INDEX_STATISTICS information_schema.INDEX_STATISTICS 1 + INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1 ++INNODB_IO_PATTERN information_schema.INNODB_IO_PATTERN 1 + KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 + PROCESSLIST information_schema.PROCESSLIST 1 + PROFILING information_schema.PROFILING 1 +diff -r d4826c0a98c2 mysql-test/r/information_schema_db.result +--- a/mysql-test/r/information_schema_db.result Wed Jul 29 09:58:58 2009 -0700 ++++ b/mysql-test/r/information_schema_db.result Wed Jul 29 10:00:12 2009 -0700 +@@ -28,6 +28,7 @@ + USER_PRIVILEGES + USER_STATISTICS + VIEWS ++INNODB_IO_PATTERN + show tables from INFORMATION_SCHEMA like 'T%'; + Tables_in_information_schema (T%) + TABLES +diff -r d4826c0a98c2 mysql-test/r/mysqlshow.result +--- a/mysql-test/r/mysqlshow.result Wed Jul 29 09:58:58 2009 -0700 ++++ b/mysql-test/r/mysqlshow.result Wed Jul 29 10:00:12 2009 -0700 +@@ -102,6 +102,7 @@ + | USER_PRIVILEGES | + | USER_STATISTICS | + | VIEWS | ++| INNODB_IO_PATTERN | + +---------------------------------------+ + Database: INFORMATION_SCHEMA + +---------------------------------------+ +@@ -130,6 +131,7 @@ + | USER_PRIVILEGES | + | USER_STATISTICS | + | VIEWS | ++| INNODB_IO_PATTERN | + +---------------------------------------+ + Wildcard: inf_rmation_schema + +--------------------+ +diff -r d4826c0a98c2 patch_info/innodb_io_pattern.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_io_pattern.info Wed Jul 29 10:00:12 2009 -0700 +@@ -0,0 +1,8 @@ ++File=innodb_io_pattern.patch ++Name=Information schema table of InnoDB IO counts for each datafile pages ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment=INFORMATION_SCHEMA.INNODB_IO_PATTERN ++2008-12-01 ++YK: fix for mysql-test +diff -r d4826c0a98c2 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/ha_innodb.cc Wed Jul 29 10:00:12 2009 -0700 +@@ -1583,6 
+1583,8 @@ + pthread_cond_init(&commit_cond, NULL); + innodb_inited= 1; + ++ buf_io_counter_control(); ++ + /* If this is a replication slave and we needed to do a crash recovery, + set the master binlog position to what InnoDB internally knew about + how far we got transactions durable inside InnoDB. There is a +@@ -6551,6 +6553,28 @@ + } + + /**************************************************************************** ++Controls the internal hash table for IO pattern tracing ++along innodb_io_pattern_trace value.*/ ++ ++void ++innodb_io_pattern_control(void) ++/*===========================*/ ++{ ++ if (innodb_inited) { ++ buf_io_counter_control(); ++ } ++} ++ ++void ++innodb_io_pattern_clear(void) ++/*=========================*/ ++{ ++ if (innodb_inited) { ++ buf_io_counter_clear(); ++ } ++} ++ ++/**************************************************************************** + Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB + Monitor to the client. */ + +diff -r d4826c0a98c2 sql/ha_innodb.h +--- a/sql/ha_innodb.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/ha_innodb.h Wed Jul 29 10:00:12 2009 -0700 +@@ -245,6 +245,9 @@ + extern uint srv_adaptive_checkpoint; + extern ulong srv_show_locks_held; + extern ulong srv_show_verbose_locks; ++extern ulong srv_io_pattern_trace; ++extern ulong srv_io_pattern_trace_running; ++extern ulong srv_io_pattern_size_limit; + + /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does + NOT update cardinality for indexes of InnoDB table". 
By default we are +@@ -278,6 +281,9 @@ + bool innodb_mutex_show_status(THD* thd); + void innodb_export_status(void); + ++void innodb_io_pattern_control(void); ++void innodb_io_pattern_clear(void); ++ + void innobase_release_temporary_latches(THD *thd); + + void innobase_store_binlog_offset_and_flush_log(char *binlog_name,longlong offset); +diff -r d4826c0a98c2 sql/lex.h +--- a/sql/lex.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/lex.h Wed Jul 29 10:00:12 2009 -0700 +@@ -244,6 +244,7 @@ + { "INNER", SYM(INNER_SYM)}, + { "INNOBASE", SYM(INNOBASE_SYM)}, + { "INNODB", SYM(INNOBASE_SYM)}, ++ { "INNODB_IO_PATTERN", SYM(INNODB_IO_PATTERN)}, + { "INOUT", SYM(INOUT_SYM)}, + { "INSENSITIVE", SYM(INSENSITIVE_SYM)}, + { "INSERT", SYM(INSERT)}, +diff -r d4826c0a98c2 sql/mysqld.cc +--- a/sql/mysqld.cc Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/mysqld.cc Wed Jul 29 10:00:12 2009 -0700 +@@ -5029,6 +5029,9 @@ + OPT_INNODB_SYNC_SPIN_LOOPS, + OPT_INNODB_CONCURRENCY_TICKETS, + OPT_INNODB_THREAD_SLEEP_DELAY, ++ OPT_INNODB_IO_PATTERN_TRACE, ++ OPT_INNODB_IO_PATTERN_TRACE_RUNNING, ++ OPT_INNODB_IO_PATTERN_SIZE_LIMIT, + OPT_BDB_CACHE_SIZE, + OPT_BDB_LOG_BUFFER_SIZE, + OPT_BDB_MAX_LOCK, +@@ -5461,6 +5464,18 @@ + "Number of background write I/O threads in InnoDB.", + (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, + 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_io_pattern_trace", OPT_INNODB_IO_PATTERN_TRACE, ++ "Create/Drop the internal hash table for IO pattern tracing.", ++ (gptr*) &srv_io_pattern_trace, (gptr*) &srv_io_pattern_trace, ++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0}, ++ {"innodb_io_pattern_trace_running", OPT_INNODB_IO_PATTERN_TRACE_RUNNING, ++ "Control IO pattern trace running or not.", ++ (gptr*) &srv_io_pattern_trace_running, (gptr*) &srv_io_pattern_trace_running, ++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0}, ++ {"innodb_io_pattern_size_limit", OPT_INNODB_IO_PATTERN_SIZE_LIMIT, ++ "Set max number of counters per data pages. 
(0 = disable counting).", ++ (gptr*) &srv_io_pattern_size_limit, (gptr*) &srv_io_pattern_size_limit, ++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, ULONG_MAX - (1024 * 1024), 0, 0, 0}, + #endif /* End HAVE_INNOBASE_DB */ + {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.", + (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0, +diff -r d4826c0a98c2 sql/set_var.cc +--- a/sql/set_var.cc Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/set_var.cc Wed Jul 29 10:00:12 2009 -0700 +@@ -546,6 +546,12 @@ + sys_var_long_ptr sys_innodb_show_verbose_locks( + "innodb_show_verbose_locks", + &srv_show_verbose_locks); ++sys_var_innodb_io_pattern_trace sys_innodb_io_pattern_trace("innodb_io_pattern_trace", ++ &srv_io_pattern_trace); ++sys_var_long_ptr sys_innodb_io_pattern_trace_running("innodb_io_pattern_trace_running", ++ &srv_io_pattern_trace_running); ++sys_var_long_ptr sys_innodb_io_pattern_size_limit("innodb_io_pattern_size_limit", ++ &srv_io_pattern_size_limit); + sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path", + &innobase_data_file_path); + sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir", +@@ -926,6 +932,9 @@ + &sys_innodb_adaptive_checkpoint, + &sys_innodb_show_locks_held, + &sys_innodb_show_verbose_locks, ++ &sys_innodb_io_pattern_trace, ++ &sys_innodb_io_pattern_trace_running, ++ &sys_innodb_io_pattern_size_limit, + #endif + &sys_trust_routine_creators, + &sys_trust_function_creators, +@@ -1075,6 +1084,9 @@ + {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, ++ {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS}, ++ {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS}, ++ {sys_innodb_io_pattern_size_limit.name, (char*) 
&sys_innodb_io_pattern_size_limit, SHOW_SYS}, + {sys_innodb_use_legacy_cardinality_algorithm.name, + (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS}, + #endif +@@ -3210,6 +3222,19 @@ + thd->variables.lc_time_names= global_system_variables.lc_time_names; + } + ++#ifdef HAVE_INNOBASE_DB ++bool sys_var_innodb_io_pattern_trace::update(THD *thd, set_var *var) ++{ ++ bool ret; ++ ++ ret = sys_var_long_ptr_global::update(thd, var); ++ ++ innodb_io_pattern_control(); ++ ++ return ret; ++} ++#endif /* HAVE_INNOBASE_DB */ ++ + /* + Functions to update thd->options bits + */ +diff -r d4826c0a98c2 sql/set_var.h +--- a/sql/set_var.h Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/set_var.h Wed Jul 29 10:00:12 2009 -0700 +@@ -1012,6 +1012,17 @@ + virtual void set_default(THD *thd, enum_var_type type); + }; + ++#ifdef HAVE_INNOBASE_DB ++/* sys_var_innodb_io_pattern_trace */ ++class sys_var_innodb_io_pattern_trace :public sys_var_long_ptr ++{ ++public: ++ sys_var_innodb_io_pattern_trace(const char *name_arg, ulong *value_ptr_arg) ++ :sys_var_long_ptr(name_arg,value_ptr_arg) {} ++ bool update(THD *thd, set_var *var); ++}; ++#endif /* HAVE_INNOBASE_DB */ ++ + /**************************************************************************** + Classes for parsing of the SET command + ****************************************************************************/ +diff -r d4826c0a98c2 sql/sql_parse.cc +--- a/sql/sql_parse.cc Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/sql_parse.cc Wed Jul 29 10:00:12 2009 -0700 +@@ -8104,6 +8104,13 @@ + } + pthread_mutex_unlock(&LOCK_global_user_client_stats); + } ++#ifdef HAVE_INNOBASE_DB ++ if (options & REFRESH_INNODB_IO_PATTERN) ++ { ++ tmp_write_to_binlog= 0; ++ innodb_io_pattern_clear(); ++ } ++#endif /* HAVE_INNOBASE_DB */ + *write_to_binlog= tmp_write_to_binlog; + return result; + } +diff -r d4826c0a98c2 sql/sql_show.cc +--- a/sql/sql_show.cc Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/sql_show.cc Wed Jul 29 10:00:12 2009 -0700 +@@ -33,6 +33,17 
@@ + #include "ha_innodb.h" + #endif + ++#ifdef HAVE_INNOBASE_DB ++#define INSIDE_HA_INNOBASE_CC ++extern "C" { ++#include "srv0srv.h" ++#include "buf0buf.h" ++#include "dict0dict.h" ++} ++/* We need to undef it in InnoDB */ ++#undef byte ++#endif /* HAVE_INNOBASE_DB */ ++ + #ifndef NO_EMBEDDED_ACCESS_CHECKS + static const char *grant_names[]={ + "select","insert","update","delete","create","drop","reload","shutdown", +@@ -4108,6 +4119,72 @@ + DBUG_RETURN(res); + } + ++int innodb_io_pattern_fill_table(THD *thd, TABLE_LIST *tables, COND *cond) ++{ ++ TABLE *table= (TABLE *) tables->table; ++ ++ buf_io_counter_t* io_counter; ++ dict_index_t* index; ++ ++ DBUG_ENTER("innodb_io_pattern_fill_table"); ++ int returnable= 0; ++ ++ /* deny access to non-superusers */ ++ if (check_global_access(thd, PROCESS_ACL)) { ++ DBUG_RETURN(0); ++ } ++ ++ /* We cannot use inline functions of InnoDB here */ ++ ++ /* !!!!!ATTENTION!!!!!: This function is not protected by mutex for performance. */ ++ /* Don't use "DROP TABLE innodb_io_pattern" and INFORMATION_SCHEMA.INNODB_IO_PATTERN */ ++ /* at the same time as possible. 
*/ ++ ++ if (srv_io_pattern) { ++ for (ulint n=0; n < buf_pool->io_counter_hash->n_cells; n++) { ++ if (!srv_io_pattern) ++ goto end_func; ++ ++ io_counter = (buf_io_counter_t*)(buf_pool->io_counter_hash->array + n)->node; ++ while (io_counter) { ++ if (!srv_io_pattern) ++ goto end_func; ++ ++ if (dict_sys != NULL) { ++ dulint id; ++ id.high = 0; ++ id.low = io_counter->index_id; ++ index = dict_index_find_on_id_low(id); ++ } else { ++ index = NULL; ++ } ++ ++ table->field[0]->store(io_counter->space); ++ table->field[1]->store(io_counter->offset); ++ table->field[2]->store(io_counter->index_id); ++ if (index != NULL) { ++ table->field[3]->store(index->table_name,strlen(index->table_name),system_charset_info); ++ table->field[4]->store(index->name,strlen(index->name),system_charset_info); ++ } else { ++ table->field[3]->store("",0,system_charset_info); ++ table->field[4]->store("",0,system_charset_info); ++ } ++ table->field[5]->store(io_counter->n_read); ++ table->field[6]->store(io_counter->n_write); ++ if (schema_table_store_record(thd, table)) ++ { ++ returnable= 1; ++ goto end_func; ++ } ++ io_counter = io_counter->hash; ++ } ++ } ++ } ++ ++ end_func: ++ DBUG_RETURN(returnable); ++} ++ + /* + Find schema_tables elment by name + +@@ -4914,6 +4986,19 @@ + {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} + }; + ++#ifdef HAVE_INNOBASE_DB ++ST_FIELD_INFO innodb_io_pattern_field_info[]= ++{ ++ {"SPACE", 11, MYSQL_TYPE_LONG, 0, 0, "space_id"}, ++ {"OFFSET", 11, MYSQL_TYPE_LONG, 0, 0, "offset"}, ++ {"INDEX_ID", 11, MYSQL_TYPE_LONG, 0, 0, "index id"}, ++ {"TABLE_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "table name"}, ++ {"INDEX_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "index name"}, ++ {"N_READ", 11, MYSQL_TYPE_LONG, 0, 0, "read ios"}, ++ {"N_WRITE", 11, MYSQL_TYPE_LONG, 0, 0, "write ios"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++#endif + + ST_FIELD_INFO variables_fields_info[]= + { +@@ -5089,6 +5174,10 @@ + make_old_format, 0, -1, -1, 1}, + {"VIEWS", view_fields_info, 
create_schema_table, + get_all_tables, 0, get_schema_views_record, 1, 2, 0}, ++#ifdef HAVE_INNOBASE_DB ++ {"INNODB_IO_PATTERN", innodb_io_pattern_field_info, create_schema_table, ++ innodb_io_pattern_fill_table, 0, 0, -1, -1, 0}, ++#endif + {0, 0, 0, 0, 0, 0, 0, 0, 0} + }; + +diff -r d4826c0a98c2 sql/sql_yacc.yy +--- a/sql/sql_yacc.yy Wed Jul 29 09:58:58 2009 -0700 ++++ b/sql/sql_yacc.yy Wed Jul 29 10:00:12 2009 -0700 +@@ -685,6 +685,7 @@ + %token INFILE + %token INNER_SYM + %token INNOBASE_SYM ++%token INNODB_IO_PATTERN + %token INOUT_SYM + %token INSENSITIVE_SYM + %token INSERT +@@ -8500,6 +8501,7 @@ + | MASTER_SYM { Lex->type|= REFRESH_MASTER; } + | DES_KEY_FILE { Lex->type|= REFRESH_DES_KEY_FILE; } + | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; } ++ | INNODB_IO_PATTERN { Lex->type|= REFRESH_INNODB_IO_PATTERN; } + | CLIENT_STATS_SYM { Lex->type|= REFRESH_CLIENT_STATS; } + | USER_STATS_SYM { Lex->type|= REFRESH_USER_STATS; } + | TABLE_STATS_SYM { Lex->type|= REFRESH_TABLE_STATS; } +@@ -9552,6 +9554,7 @@ + | ISOLATION {} + | ISSUER_SYM {} + | INNOBASE_SYM {} ++ | INNODB_IO_PATTERN {} + | INSERT_METHOD {} + | IO_SYM {} + | IPC_SYM {} diff --git a/percona/5.0.91-b22-20100522/innodb_io_tune.patch b/percona/5.0.91-b22-20100522/innodb_io_tune.patch new file mode 100644 index 0000000..3953e1d --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_io_tune.patch @@ -0,0 +1,1823 @@ +diff -r 322370200e6a innobase/include/os0file.h +--- a/innobase/include/os0file.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/include/os0file.h Mon Nov 03 05:08:52 2008 -0800 +@@ -532,21 +532,16 @@ + FALSE otherwise */ + const char* path); /* in: path name */ + /**************************************************************************** +-Initializes the asynchronous io system. 
Creates separate aio array for +-non-ibuf read and write, a third aio array for the ibuf i/o, with just one +-segment, two aio arrays for log reads and writes with one segment, and a +-synchronous aio array of the specified size. The combined number of segments +-in the three first aio arrays is the parameter n_segments given to the +-function. The caller must create an i/o handler thread for each segment in +-the four first arrays, but not for the sync aio array. */ ++Initializes the asynchronous io system. */ + +-void ++ulint + os_aio_init( + /*========*/ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++ /* out: number of AIO handler threads */ ++ ulint ios_per_array, /* in: maximum number of pending aio operations ++ allowed per IO array */ ++ ulint n_read_threads, /* in: number of read threads */ ++ ulint n_write_threads, /* in: number of write threads */ + ulint n_slots_sync); /* in: number of slots in the sync aio array */ + /*********************************************************************** + Requests an asynchronous i/o operation. 
*/ +diff -r 322370200e6a innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800 +@@ -87,6 +87,14 @@ + extern ulint srv_lock_table_size; + + extern ulint srv_n_file_io_threads; ++extern ulint srv_n_read_io_threads; ++extern ulint srv_n_write_io_threads; ++ ++/* Number of IO operations per second the server can do */ ++extern ulint srv_io_capacity; ++ ++/* Flush dirty pages when below max dirty percent */ ++extern ibool srv_extra_dirty_writes; + + #ifdef UNIV_LOG_ARCHIVE + extern ibool srv_log_archive_on; +@@ -252,6 +260,24 @@ + + /* variable to count the number of random read-aheads were done */ + extern ulint srv_read_ahead_rnd; ++ ++/* Number of IO operations read/write done for all threads */ ++extern ulint os_aio_read_requests; ++extern ulint os_aio_write_requests; ++ ++/* Number of pages read/written done for all threads */ ++extern ulint os_aio_pages_read; ++extern ulint os_aio_pages_written; ++ ++/* time usec used to perform read/write for all threads */ ++extern ib_longlong os_aio_read_time; ++extern ib_longlong os_aio_write_time; ++ ++extern ulint inno_pending_normal_aio_reads; ++extern ulint inno_pending_normal_aio_writes; ++extern ulint inno_pending_ibuf_aio_reads; ++extern ulint inno_pending_log_ios; ++extern ulint inno_pending_sync_ios; + + /* In this structure we store status variables to be passed to MySQL */ + typedef struct export_var_struct export_struc; +diff -r 322370200e6a innobase/log/log0log.c +--- a/innobase/log/log0log.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/log/log0log.c Mon Nov 03 05:08:52 2008 -0800 +@@ -1537,6 +1537,30 @@ + + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE, + LOG_WRITE_FROM_BACKGROUND_SYNC); ++} ++ ++/******************************************************************** ++Flush the log buffer. Force it to disk depending on the value of ++innodb_flush_log_at_trx_commit. 
*/ ++ ++void ++log_buffer_flush_maybe_sync(void) ++/*==========================*/ ++{ ++ dulint lsn; ++ ++ mutex_enter(&(log_sys->mutex)); ++ ++ lsn = log_sys->lsn; ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */ ++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, ++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE, ++ srv_flush_log_at_trx_commit == 1 ? ++ LOG_WRITE_FROM_BACKGROUND_SYNC : ++ LOG_WRITE_FROM_BACKGROUND_ASYNC); + } + + /******************************************************************** +diff -r 322370200e6a innobase/os/os0file.c +--- a/innobase/os/os0file.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/os/os0file.c Mon Nov 03 05:08:52 2008 -0800 +@@ -22,6 +22,8 @@ + #include <errno.h> + #endif /* UNIV_HOTBACKUP */ + ++extern long innobase_max_merged_io; ++ + #undef HAVE_FDATASYNC + + #ifdef POSIX_ASYNC_IO +@@ -63,6 +65,28 @@ + ibool os_aio_use_native_aio = FALSE; + + ibool os_aio_print_debug = FALSE; ++ ++/* State for the state of an IO request in simulated AIO. ++ Protocol for simulated aio: ++ client requests IO: find slot with reserved = FALSE. Add entry with ++ status = OS_AIO_NOT_ISSUED. ++ IO thread wakes: find adjacent slots with reserved = TRUE and status = ++ OS_AIO_NOT_ISSUED. Change status for slots to ++ OS_AIO_ISSUED. ++ IO operation completes: set status for slots to OS_AIO_DONE. set status ++ for the first slot to OS_AIO_CLAIMED and return ++ result for that slot. ++ When there are multiple read and write threads, they all compete to execute ++ the requests in the array (os_aio_array_t). This avoids the need to load ++ balance requests at the time the request is made at the cost of waking all ++ threads when a request is available. ++*/ ++typedef enum { ++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */ ++ OS_AIO_ISSUED, /* Being processed by an IO thread. */ ++ OS_AIO_DONE, /* Request processed. */ ++ OS_AIO_CLAIMED /* Result being returned to client. 
*/ ++} os_aio_status; + + /* The aio array slot structure */ + typedef struct os_aio_slot_struct os_aio_slot_t; +@@ -72,6 +96,8 @@ + ulint pos; /* index of the slot in the aio + array */ + ibool reserved; /* TRUE if this slot is reserved */ ++ os_aio_status status; /* Status for current request. Valid when reserved ++ is TRUE. Used only in simulated aio. */ + time_t reservation_time;/* time when reserved */ + ulint len; /* length of the block to read or + write */ +@@ -82,11 +108,6 @@ + ulint offset_high; /* 32 high bits of file offset */ + os_file_t file; /* file where to read or write */ + const char* name; /* file name or path */ +- ibool io_already_done;/* used only in simulated aio: +- TRUE if the physical i/o already +- made and only the slot message +- needs to be passed to the caller +- of os_aio_simulated_handle */ + fil_node_t* message1; /* message which is given by the */ + void* message2; /* the requester of an aio operation + and which can be used to identify +@@ -116,9 +137,6 @@ + in this array */ + ulint n_slots; /* Total number of slots in the aio array. + This must be divisible by n_threads. */ +- ulint n_segments;/* Number of segments in the aio array of +- pending aio requests. A thread can wait +- separately for any one of the segments. */ + ulint n_reserved;/* Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /* Pointer to the slots in the array */ +@@ -134,6 +152,17 @@ + + /* Array of events used in simulated aio */ + os_event_t* os_aio_segment_wait_events = NULL; ++ ++/* Number of threads for reading and writing. */ ++ulint os_aio_read_threads = 0; ++ulint os_aio_write_threads = 0; ++ ++/* Number for the first global segment for reading. */ ++const ulint os_aio_first_read_segment = 2; ++ ++/* Number for the first global segment for writing. Set to ++2 + os_aio_read_write_threads. */ ++ulint os_aio_first_write_segment = 0; + + /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. 
These + are NULL when the module has not yet been initialized. */ +@@ -143,11 +172,39 @@ + static os_aio_array_t* os_aio_log_array = NULL; + static os_aio_array_t* os_aio_sync_array = NULL; + ++/* Per thread buffer used for merged IO requests. Used by ++os_aio_simulated_handle so that a buffer doesn't have to be allocated ++for each request. */ ++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS]; ++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS]; ++ ++/* Count pages read and written per thread */ ++static ulint os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS]; ++static ulint os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS]; ++ ++/* Number of IO operations done. One request can be for N pages. */ ++static ulint os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS]; ++ ++/* usecs spent blocked on an IO request */ ++static double os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS]; ++/* max usecs spent blocked on an IO request */ ++static double os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS]; ++ ++/* Number of IO global segments. An IO handler thread is created for each ++global segment, except for the segment associated with os_aio_sync_array. ++Several segments can be associated with os_aio_{read,write}_array. One ++segment is created for each of the other arrays. This is also the number ++of valid entries in srv_io_thread_reads, srv_io_thread_writes, ++srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */ + static ulint os_aio_n_segments = ULINT_UNDEFINED; + +-/* If the following is TRUE, read i/o handler threads try to +-wait until a batch of new read requests have been posted */ +-static ibool os_aio_recommend_sleep_for_read_threads = FALSE; ++/* Set to TRUE to temporarily block reads from being scheduled while a batch ++of read requests is added to allow them to be merged by the IO handler thread ++if they are adjacent. 
Declared volatile because we don't want this to be ++read from a register in a loop when another thread may change the value in ++memory. ++*/ ++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE; + + ulint os_n_file_reads = 0; + ulint os_bytes_read_since_printout = 0; +@@ -166,6 +223,19 @@ + ulint os_file_n_pending_pwrites = 0; + ulint os_n_pending_writes = 0; + ulint os_n_pending_reads = 0; ++ ++/* TODO -- does InnoDB provide a portable method for this? */ ++static double time_usecs() { ++#ifdef __WIN__ ++ return 0.0; ++#else ++ struct timeval tv; ++ if (gettimeofday(&tv, NULL)) ++ return 0; ++ else ++ return tv.tv_sec * 1000000.0 + tv.tv_usec; ++#endif ++} + + /*************************************************************************** + Gets the operating system version. Currently works only on Windows. */ +@@ -1351,6 +1421,8 @@ + /* We disable OS caching (O_DIRECT) only on data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { ++ ++ fprintf(stderr, "Using O_DIRECT for file %s\n", name); + + os_file_set_nocache(file, name, mode_str); + } +@@ -1798,6 +1870,32 @@ + #endif /* __WIN__ */ + } + ++#ifndef __WIN__ ++/*************************************************************************** ++Possibly flushes a given file to disk. */ ++ ++ibool ++os_maybe_fsync( ++/*==========*/ ++ /* out: 0 if success, error code otherwise */ ++ os_file_t file) /* in, own: handle to a file */ ++{ ++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fsync(file); ++} ++ ++/*************************************************************************** ++Possibly flushes a given file to disk. */ ++ ++ibool ++os_maybe_fdatasync( ++/*==========*/ ++ /* out: 0 if success, error code otherwise */ ++ os_file_t file) /* in, own: handle to a file */ ++{ ++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 
0 : fdatasync(file); ++} ++#endif ++ + /*************************************************************************** + Flushes the write buffers of a given file to the disk. */ + +@@ -1855,21 +1953,21 @@ + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + } + } + #elif HAVE_FDATASYNC +- ret = fdatasync(file); ++ ret = os_maybe_fdatasync(file); + #else + /* fprintf(stderr, "Flushing to file %p\n", file); */ +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + #endif + os_n_fsyncs++; + +@@ -2298,6 +2396,9 @@ + + return(TRUE); + } ++ fprintf(stderr, ++"InnoDB: error: os_file_pread wanted %lu and got %lu.\n", ++ (ulint) n, (ulint) ret); + #endif + #ifdef __WIN__ + error_handling: +@@ -2784,9 +2885,8 @@ + os_aio_array_create( + /*================*/ + /* out, own: aio array */ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments) /* in: number of segments in the aio array */ ++ ulint n) /* in: maximum number of pending aio operations ++ allowed */ + { + os_aio_array_t* array; + ulint i; +@@ -2795,7 +2895,6 @@ + OVERLAPPED* over; + #endif + ut_a(n > 0); +- ut_a(n_segments > 0); + + array = ut_malloc(sizeof(os_aio_array_t)); + +@@ -2806,7 +2905,6 @@ + os_event_set(array->is_empty); + + array->n_slots = n; +- array->n_segments = n_segments; + array->n_reserved = 0; + array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); + #ifdef __WIN__ +@@ -2833,70 +2931,75 @@ + + /**************************************************************************** + Initializes the asynchronous io system. Calls also os_io_init_simple. 
+-Creates a separate aio array for +-non-ibuf read and write, a third aio array for the ibuf i/o, with just one +-segment, two aio arrays for log reads and writes with one segment, and a +-synchronous aio array of the specified size. The combined number of segments +-in the three first aio arrays is the parameter n_segments given to the +-function. The caller must create an i/o handler thread for each segment in +-the four first arrays, but not for the sync aio array. */ +- +-void ++Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO, ++log IO, and synchronous IO. The caller must create i/o handler thread for all ++but the synchronous aio array. Multiple threads can access the same array for ++the non-ibuf read (prefetch) and write (flush dirty buffer pages) arrays. ++Return the number of AIO handler threads. */ ++ ++ulint + os_aio_init( + /*========*/ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++ ulint ios_per_array, /* in: maximum number of pending aio operations ++ allowed per array */ ++ ulint n_read_threads, /* in: number of read threads */ ++ ulint n_write_threads, /* in: number of write threads */ + ulint n_slots_sync) /* in: number of slots in the sync aio array */ + { +- ulint n_read_segs; +- ulint n_write_segs; +- ulint n_per_seg; +- ulint i; ++ ulint i; ++ ulint n_segments = 2 + n_read_threads + n_write_threads; + #ifdef POSIX_ASYNC_IO + sigset_t sigset; + #endif +- ut_ad(n % n_segments == 0); +- ut_ad(n_segments >= 4); ++ ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD); ++ ut_a(n_read_threads >= 1 && n_read_threads <= 64); ++ ut_a(n_write_threads >= 1 && n_write_threads <= 64); ++ ut_a(n_segments < SRV_MAX_N_IO_THREADS); + + os_io_init_simple(); + + for (i = 0; i < n_segments; i++) { + srv_set_io_thread_op_info(i, "not started yet"); +- } +- +- n_per_seg = n 
/ n_segments; +- n_write_segs = (n_segments - 2) / 2; +- n_read_segs = n_segments - 2 - n_write_segs; +- +- /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ +- +- os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); ++ os_aio_thread_io_reads[i] = 0; ++ os_aio_thread_io_writes[i] = 0; ++ os_aio_thread_io_requests[i] = 0; ++ os_aio_thread_buffer[i] = 0; ++ os_aio_thread_buffer_size[i] = 0; ++ os_aio_thread_io_wait[i] = 0; ++ os_aio_thread_max_io_wait[i] = 0; ++ } ++ ++ os_aio_read_threads = n_read_threads; ++ os_aio_write_threads = n_write_threads; ++ os_aio_first_write_segment = os_aio_first_read_segment + os_aio_read_threads; ++ ++ fprintf(stderr, ++ "InnoDB: ios_per_array %lu read threads %lu write threads %lu\n", ++ ios_per_array, os_aio_read_threads, os_aio_write_threads); ++ ++ os_aio_ibuf_array = os_aio_array_create(ios_per_array); + + srv_io_thread_function[0] = "insert buffer thread"; + +- os_aio_log_array = os_aio_array_create(n_per_seg, 1); ++ os_aio_log_array = os_aio_array_create(ios_per_array); + + srv_io_thread_function[1] = "log thread"; + +- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, +- n_read_segs); +- for (i = 2; i < 2 + n_read_segs; i++) { ++ os_aio_read_array = os_aio_array_create(ios_per_array); ++ for (i = os_aio_first_read_segment; i < os_aio_first_write_segment; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); +- srv_io_thread_function[i] = "read thread"; +- } +- +- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, +- n_write_segs); +- for (i = 2 + n_read_segs; i < n_segments; i++) { ++ srv_io_thread_function[i] = "read thread"; ++ } ++ ++ os_aio_write_array = os_aio_array_create(ios_per_array); ++ for (i = os_aio_first_write_segment; i < n_segments; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); +- srv_io_thread_function[i] = "write thread"; +- } +- +- os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); +- +- os_aio_n_segments = n_segments; ++ srv_io_thread_function[i] = "write thread"; ++ } ++ ++ 
os_aio_sync_array = os_aio_array_create(n_slots_sync); ++ ++ os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads; + + os_aio_validate(); + +@@ -2924,6 +3027,7 @@ + + pthread_sigmask(SIG_BLOCK, &sigset, NULL); */ + #endif ++ return os_aio_n_segments; + } + + #ifdef WIN_ASYNC_IO +@@ -2981,77 +3085,32 @@ + os_event_wait(os_aio_write_array->is_empty); + } + +-/************************************************************************** +-Calculates segment number for a slot. */ +-static +-ulint +-os_aio_get_segment_no_from_slot( +-/*============================*/ +- /* out: segment number (which is the number +- used by, for example, i/o-handler threads) */ +- os_aio_array_t* array, /* in: aio wait array */ +- os_aio_slot_t* slot) /* in: slot in this array */ +-{ +- ulint segment; +- ulint seg_len; +- +- if (array == os_aio_ibuf_array) { +- segment = 0; +- +- } else if (array == os_aio_log_array) { +- segment = 1; +- +- } else if (array == os_aio_read_array) { +- seg_len = os_aio_read_array->n_slots / +- os_aio_read_array->n_segments; +- +- segment = 2 + slot->pos / seg_len; +- } else { +- ut_a(array == os_aio_write_array); +- seg_len = os_aio_write_array->n_slots / +- os_aio_write_array->n_segments; +- +- segment = os_aio_read_array->n_segments + 2 +- + slot->pos / seg_len; +- } +- +- return(segment); +-} +- +-/************************************************************************** +-Calculates local segment number and aio array from global segment number. */ +-static +-ulint +-os_aio_get_array_and_local_segment( ++ ++/************************************************************************** ++Calculates aio array from global segment number. 
*/ ++static ++os_aio_array_t* ++os_aio_get_array( + /*===============================*/ +- /* out: local segment number within +- the aio array */ +- os_aio_array_t** array, /* out: aio wait array */ ++ /* out: aio wait array */ + ulint global_segment)/* in: global segment number */ + { +- ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (global_segment == 0) { +- *array = os_aio_ibuf_array; +- segment = 0; ++ return os_aio_ibuf_array; + + } else if (global_segment == 1) { +- *array = os_aio_log_array; +- segment = 0; +- +- } else if (global_segment < os_aio_read_array->n_segments + 2) { +- *array = os_aio_read_array; +- +- segment = global_segment - 2; +- } else { +- *array = os_aio_write_array; +- +- segment = global_segment - (os_aio_read_array->n_segments + 2); +- } +- +- return(segment); ++ return os_aio_log_array; ++ ++ } else if (global_segment < os_aio_first_write_segment) { ++ return os_aio_read_array; ++ ++ } else { ++ return os_aio_write_array; ++ ++ } + } + + /*********************************************************************** +@@ -3160,7 +3219,7 @@ + + os_aio_simulated_wake_handler_threads(); + } +- ++ + os_event_wait(array->not_full); + + goto loop; +@@ -3173,7 +3232,7 @@ + break; + } + } +- ++ ut_a(i < array->n_slots); + array->n_reserved++; + + if (array->n_reserved == 1) { +@@ -3195,7 +3254,7 @@ + slot->buf = buf; + slot->offset = offset; + slot->offset_high = offset_high; +- slot->io_already_done = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + #ifdef WIN_ASYNC_IO + control = &(slot->control); +@@ -3246,8 +3305,9 @@ + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); +- ++ + slot->reserved = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + array->n_reserved--; + +@@ -3266,36 +3326,40 @@ + } + + /************************************************************************** +-Wakes up a simulated aio i/o-handler thread if it has something to do. 
*/ ++Wake up the simulated aio i/o-handler threads for a given array if there ++is work to do. */ + static + void + os_aio_simulated_wake_handler_thread( + /*=================================*/ +- ulint global_segment) /* in: the number of the segment in the aio +- arrays */ +-{ +- os_aio_array_t* array; +- os_aio_slot_t* slot; +- ulint segment; ++ os_aio_array_t* array) /* in: aio array for which wakeup is done */ ++{ ++ os_aio_slot_t* slot; + ulint n; + ulint i; + + ut_ad(!os_aio_use_native_aio); + +- segment = os_aio_get_array_and_local_segment(&array, global_segment); +- +- n = array->n_slots / array->n_segments; +- +- /* Look through n slots after the segment * n'th slot */ +- +- os_mutex_enter(array->mutex); +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved) { +- /* Found an i/o request */ +- ++ n = array->n_slots; ++ ++ /* Look through n slots */ ++ ++ os_mutex_enter(array->mutex); ++ ++ for (i = 0; i < n; i++) { ++ slot = os_aio_array_get_nth_slot(array, i ); ++ ++ if (slot->reserved && ++ (slot->status == OS_AIO_NOT_ISSUED || ++ slot->status == OS_AIO_DONE)) { ++ /* Found an i/o request. ++ * OS_AIO_NOT_ISSUED means the read or write request has ++ * yet to be done. OS_AIO_DONE means the request has been ++ * done but it was part of a set of requests merged into ++ * one read or write call and was not the first block in ++ * the request, so the handling of the IO completion for ++ * that block has not been done.
*/ ++ + break; + } + } +@@ -3303,7 +3367,25 @@ + os_mutex_exit(array->mutex); + + if (i < n) { +- os_event_set(os_aio_segment_wait_events[global_segment]); ++ if (array == os_aio_ibuf_array) { ++ os_event_set(os_aio_segment_wait_events[0]); ++ ++ } else if (array == os_aio_log_array) { ++ os_event_set(os_aio_segment_wait_events[1]); ++ ++ } else if (array == os_aio_read_array) { ++ ulint x; ++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else if (array == os_aio_write_array) { ++ ulint x; ++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else { ++ ut_a(0); ++ } + } + } + +@@ -3320,13 +3402,14 @@ + /* We do not use simulated aio: do nothing */ + + return; +- } +- +- os_aio_recommend_sleep_for_read_threads = FALSE; +- +- for (i = 0; i < os_aio_n_segments; i++) { +- os_aio_simulated_wake_handler_thread(i); +- } ++ } ++ ++ os_aio_recommend_sleep_for_read_threads = FALSE; ++ ++ os_aio_simulated_wake_handler_thread(os_aio_ibuf_array); ++ os_aio_simulated_wake_handler_thread(os_aio_log_array); ++ os_aio_simulated_wake_handler_thread(os_aio_read_array); ++ os_aio_simulated_wake_handler_thread(os_aio_write_array); + } + + /************************************************************************** +@@ -3339,18 +3422,13 @@ + os_aio_simulated_put_read_threads_to_sleep(void) + /*============================================*/ + { +- os_aio_array_t* array; + ulint g; + ++ /* TODO(mcallaghan): provide similar function for write? 
*/ + os_aio_recommend_sleep_for_read_threads = TRUE; + +- for (g = 0; g < os_aio_n_segments; g++) { +- os_aio_get_array_and_local_segment(&array, g); +- +- if (array == os_aio_read_array) { +- +- os_event_reset(os_aio_segment_wait_events[g]); +- } ++ for (g = os_aio_first_read_segment; g < os_aio_first_write_segment; g++) { ++ os_event_reset(os_aio_segment_wait_events[g]); + } + } + +@@ -3480,8 +3558,7 @@ + #endif + } else { + if (!wake_later) { +- os_aio_simulated_wake_handler_thread( +- os_aio_get_segment_no_from_slot(array, slot)); ++ os_aio_simulated_wake_handler_thread(array); + } + } + } else if (type == OS_FILE_WRITE) { +@@ -3497,8 +3574,7 @@ + #endif + } else { + if (!wake_later) { +- os_aio_simulated_wake_handler_thread( +- os_aio_get_segment_no_from_slot(array, slot)); ++ os_aio_simulated_wake_handler_thread(array); + } + } + } else { +@@ -3561,7 +3637,7 @@ + os_aio_windows_handle( + /*==================*/ + /* out: TRUE if the aio operation succeeded */ +- ulint segment, /* in: the number of the segment in the aio ++ ulint global_segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as +@@ -3579,7 +3655,6 @@ + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ + { +- ulint orig_seg = segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; +@@ -3588,33 +3663,30 @@ + BOOL ret; + DWORD len; + +- if (segment == ULINT_UNDEFINED) { ++ if (global_segment == ULINT_UNDEFINED) { + array = os_aio_sync_array; +- segment = 0; +- } else { +- segment = os_aio_get_array_and_local_segment(&array, segment); ++ } else { ++ array = os_aio_get_array(global_segment); + } + + /* NOTE! We only access constant fields in os_aio_array. 
Therefore + we do not have to acquire the protecting mutex yet */ + + ut_ad(os_aio_validate()); +- ut_ad(segment < array->n_segments); +- +- n = array->n_slots / array->n_segments; ++ ++ n = array->n_slots; + + if (array == os_aio_sync_array) { + os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); + i = pos; + } else { +- srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); +- i = os_event_wait_multiple(n, +- (array->native_events) + segment * n); +- } +- +- os_mutex_enter(array->mutex); +- +- slot = os_aio_array_get_nth_slot(array, i + segment * n); ++ srv_set_io_thread_op_info(global_segment, "wait Windows aio"); ++ i = os_event_wait_multiple(n, (array->native_events)); ++ } ++ ++ os_mutex_enter(array->mutex); ++ ++ slot = os_aio_array_get_nth_slot(array, i); + + ut_a(slot->reserved); + +@@ -3787,14 +3859,16 @@ + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ + { + os_aio_array_t* array; +- ulint segment; + os_aio_slot_t* slot; + os_aio_slot_t* slot2; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; ++ os_aio_slot_t* lowest_request; ++ os_aio_slot_t* oldest_request; + ulint n_consecutive; + ulint total_len; + ulint offs; + ulint lowest_offset; ++ ulint oldest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; +@@ -3802,8 +3876,10 @@ + ibool ret; + ulint n; + ulint i; +- +- segment = os_aio_get_array_and_local_segment(&array, global_segment); ++ ++ double start_usecs, stop_usecs, elapsed_usecs; ++ time_t now; ++ array = os_aio_get_array(global_segment); + + restart: + /* NOTE! We only access constant fields in os_aio_array. 
Therefore +@@ -3812,11 +3888,10 @@ + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate()); +- ut_ad(segment < array->n_segments); +- +- n = array->n_slots / array->n_segments; +- +- /* Look through n slots after the segment * n'th slot */ ++ ++ n = array->n_slots; ++ ++ /* Look through n slots */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { +@@ -3836,9 +3911,9 @@ + done */ + + for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved && slot->io_already_done) { ++ slot = os_aio_array_get_nth_slot(array, i); ++ ++ if (slot->reserved && slot->status == OS_AIO_DONE) { + + if (os_aio_print_debug) { + fprintf(stderr, +@@ -3846,79 +3921,66 @@ + } + + ret = TRUE; +- ++ + goto slot_io_done; + } + } + +- n_consecutive = 0; +- +- /* If there are at least 2 seconds old requests, then pick the oldest +- one to prevent starvation. If several requests have the same age, +- then pick the one at the lowest offset. */ +- + biggest_age = 0; +- lowest_offset = ULINT_MAX; +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved) { +- age = (ulint)difftime(time(NULL), +- slot->reservation_time); +- ++ now = time(NULL); ++ oldest_request = lowest_request = NULL; ++ oldest_offset = lowest_offset = ULINT_MAX; ++ ++ /* Find the oldest request and the request with the smallest offset */ ++ for (i = 0; i < n; i++) { ++ slot = os_aio_array_get_nth_slot(array, i); ++ ++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) { ++ age = (ulint)difftime(now, slot->reservation_time); ++ ++ /* If there are at least 2 seconds old requests, then pick the oldest ++ one to prevent starvation. If several requests have the same age, ++ then pick the one at the lowest offset. 
*/ + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age +- && slot->offset < lowest_offset)) { ++ && slot->offset < oldest_offset)) { + + /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- + biggest_age = age; ++ oldest_request = slot; ++ oldest_offset = slot->offset; ++ } ++ ++ /* Look for an i/o request at the lowest offset in the array ++ * (we ignore the high 32 bits of the offset) */ ++ if (slot->offset < lowest_offset) { ++ /* Found an i/o request */ ++ lowest_request = slot; + lowest_offset = slot->offset; + } + } + } + +- if (n_consecutive == 0) { +- /* There were no old requests. Look for an i/o request at the +- lowest offset in the array (we ignore the high 32 bits of the +- offset in these heuristics) */ +- +- lowest_offset = ULINT_MAX; +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, +- i + segment * n); +- +- if (slot->reserved && slot->offset < lowest_offset) { +- +- /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- +- lowest_offset = slot->offset; +- } +- } +- } +- +- if (n_consecutive == 0) { ++ if (!lowest_request && !oldest_request) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + +- slot = consecutive_ios[0]; ++ if (oldest_request) { ++ slot = oldest_request; ++ } else { ++ slot = lowest_request; ++ } ++ consecutive_ios[0] = slot; ++ n_consecutive = 1; + + /* Check if there are several consecutive blocks to read or write */ + + consecutive_loop: + for (i = 0; i < n; i++) { +- slot2 = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot2 = os_aio_array_get_nth_slot(array, i); + + if (slot2->reserved && slot2 != slot + && slot2->offset == slot->offset + slot->len +@@ -3926,7 +3988,8 @@ + sum does not wrap over */ + && slot2->offset_high == slot->offset_high + && slot2->type == slot->type +- && slot2->file == slot->file) { ++ && slot2->file == slot->file ++ && slot2->status == OS_AIO_NOT_ISSUED) { + + /* 
Found a consecutive i/o request */ + +@@ -3935,7 +3998,8 @@ + + slot = slot2; + +- if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { ++ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE ++ && n_consecutive < innobase_max_merged_io) { + + goto consecutive_loop; + } else { +@@ -3955,6 +4019,8 @@ + + for (i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; ++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_ISSUED; + } + + if (n_consecutive == 1) { +@@ -3962,7 +4028,16 @@ + combined_buf = slot->buf; + combined_buf2 = NULL; + } else { +- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) { ++ ++ if (os_aio_thread_buffer[global_segment]) ++ ut_free(os_aio_thread_buffer[global_segment]); ++ ++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ ++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE; ++ } ++ combined_buf2 = os_aio_thread_buffer[global_segment]; + + ut_a(combined_buf2); + +@@ -3973,6 +4048,9 @@ + this assumes that there is just one i/o-handler thread serving + a single segment of slots! 
*/ + ++ ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_ISSUED); ++ + os_mutex_exit(array->mutex); + + if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { +@@ -3998,6 +4076,7 @@ + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (slot->type == OS_FILE_WRITE) { ++ os_aio_thread_io_writes[global_segment] += n_consecutive; + if (array == os_aio_write_array) { + if ((total_len % UNIV_PAGE_SIZE != 0) + || (slot->offset % UNIV_PAGE_SIZE != 0)) { +@@ -4012,16 +4091,34 @@ + os_file_check_page_trailers(combined_buf, total_len); + } + ++ start_usecs = time_usecs(); + ret = os_file_write(slot->name, slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); +- ++ stop_usecs = time_usecs(); ++ elapsed_usecs = stop_usecs - start_usecs; ++ if (elapsed_usecs < 0) elapsed_usecs = 0; + if (array == os_aio_write_array) { + os_file_check_page_trailers(combined_buf, total_len); + } +- } else { ++ os_aio_write_requests++; ++ os_aio_pages_written += n_consecutive; ++ os_aio_write_time += (ib_longlong)elapsed_usecs; ++ } else { ++ start_usecs = time_usecs(); ++ os_aio_thread_io_reads[global_segment] += n_consecutive; + ret = os_file_read(slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); +- } ++ stop_usecs = time_usecs(); ++ elapsed_usecs = stop_usecs - start_usecs; ++ if (elapsed_usecs < 0) elapsed_usecs = 0; ++ os_aio_read_requests++; ++ os_aio_pages_read += n_consecutive; ++ os_aio_read_time += (ib_longlong)elapsed_usecs; ++ } ++ if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment]) ++ os_aio_thread_max_io_wait[global_segment] = elapsed_usecs; ++ os_aio_thread_io_wait[global_segment] += elapsed_usecs; ++ os_aio_thread_io_requests[global_segment]++; + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); +@@ -4042,16 +4139,13 @@ + } + } + +- if (combined_buf2) { +- ut_free(combined_buf2); +- } +- + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (i = 0; i < 
n_consecutive; i++) { +- consecutive_ios[i]->io_already_done = TRUE; ++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_DONE; + } + + /* We return the messages for the first slot now, and if there were +@@ -4061,6 +4155,8 @@ + slot_io_done: + + ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_DONE); ++ slot->status = OS_AIO_CLAIMED; + + *message1 = slot->message1; + *message2 = slot->message2; +@@ -4070,7 +4166,8 @@ + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); +- ++ srv_set_io_thread_op_info(global_segment, "exited handler"); ++ + return(ret); + + wait_for_io: +@@ -4115,7 +4212,6 @@ + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); +- ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); +@@ -4165,11 +4261,20 @@ + double time_elapsed; + double avg_bytes_read; + ulint i; +- +- for (i = 0; i < srv_n_file_io_threads; i++) { +- fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i, +- srv_io_thread_op_info[i], +- srv_io_thread_function[i]); ++ ulint num_issued, num_done, num_claimed; ++ ++ if (file) { ++ for (i = 0; i < os_aio_n_segments; i++) { ++ fprintf(file, ++ "I/O thread %lu state: %s (%s) reads %lu writes %lu " ++ "requests %lu io secs %lf io msecs/request %lf max_io_wait %lf", ++ i, srv_io_thread_op_info[i], srv_io_thread_function[i], ++ os_aio_thread_io_reads[i], os_aio_thread_io_writes[i], ++ os_aio_thread_io_requests[i], ++ os_aio_thread_io_wait[i] / 1000000.0, ++ os_aio_thread_io_requests[i] ? 
++ os_aio_thread_io_wait[i] / os_aio_thread_io_requests[i] / 1000.0 : 0.0, ++ os_aio_thread_max_io_wait[i] / 1000.0); + + #ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { +@@ -4181,6 +4286,7 @@ + } + + fputs("Pending normal aio reads:", file); ++ } // if (file) + + array = os_aio_read_array; + loop: +@@ -4189,14 +4295,23 @@ + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); +- ut_a(array->n_segments > 0); + + n_reserved = 0; ++ num_done = num_issued = num_claimed = 0; + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { ++ if (slot->status == OS_AIO_ISSUED) ++ num_issued++; ++ else if (slot->status == OS_AIO_DONE) ++ num_done++; ++ else { ++ ut_ad(slot->status == OS_AIO_CLAIMED); ++ num_claimed++; ++ } ++ + n_reserved++; + /* fprintf(stderr, "Reserved slot, messages %p %p\n", + slot->message1, slot->message2); */ +@@ -4206,42 +4321,56 @@ + + ut_a(array->n_reserved == n_reserved); + +- fprintf(file, " %lu", (ulong) n_reserved); +- ++ if (file) fprintf(file, " %lu", (ulong) n_reserved); ++ + os_mutex_exit(array->mutex); + + if (array == os_aio_read_array) { +- fputs(", aio writes:", file); +- ++ inno_pending_normal_aio_reads = (ulong) n_reserved; ++ if (file) fputs(", aio writes:", file); + array = os_aio_write_array; + + goto loop; + } + + if (array == os_aio_write_array) { +- fputs(",\n ibuf aio reads:", file); ++ inno_pending_normal_aio_writes = (ulong) n_reserved; ++ if (file) fputs(",\n ibuf aio reads:", file); + array = os_aio_ibuf_array; + + goto loop; + } + + if (array == os_aio_ibuf_array) { +- fputs(", log i/o's:", file); ++ inno_pending_ibuf_aio_reads = (ulong) n_reserved; ++ if (file) fputs(", log i/o's:", file); + array = os_aio_log_array; + + goto loop; + } + + if (array == os_aio_log_array) { +- fputs(", sync i/o's:", file); ++ inno_pending_log_ios = (ulong) n_reserved; ++ if (file) fputs(", sync i/o's:", file); + array = os_aio_sync_array; + + goto loop; + } + +- 
putc('\n', file); ++ if (array == os_aio_sync_array) { ++ inno_pending_sync_ios = (ulong) n_reserved; ++ } ++ + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); ++ ++ if (file) { ++ putc('\n', file); ++ fprintf(file, ++ "Summary of background IO slot status: %lu issued, " ++ "%lu done, %lu claimed, sleep set %d\n", ++ num_issued, num_done, num_claimed, ++ os_aio_recommend_sleep_for_read_threads); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" +@@ -4274,6 +4403,7 @@ + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); ++ } // if (file) + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; +diff -r 322370200e6a innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800 +@@ -164,7 +164,17 @@ + ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */ + ulint srv_lock_table_size = ULINT_MAX; + ++ulint srv_io_capacity = ULINT_MAX; /* Number of IO operations per ++ second the server can do */ ++ ++ibool srv_extra_dirty_writes = TRUE; /* Write dirty pages to disk when pct ++ dirty < max dirty pct */ ++ ++/* Deprecated by srv_n_{read,write}_io_threads */ + ulint srv_n_file_io_threads = ULINT_MAX; ++/* Number of background IO threads for read and write requests */ ++ulint srv_n_read_io_threads = ULINT_MAX; ++ulint srv_n_write_io_threads = ULINT_MAX; + + #ifdef UNIV_LOG_ARCHIVE + ibool srv_log_archive_on = FALSE; +@@ -238,6 +248,24 @@ + + /* variable to count the number of random read-aheads */ + ulint srv_read_ahead_rnd = 0; ++ ++/* Number of IO operations read/write done for all threads */ ++ulint os_aio_read_requests = 0; ++ulint os_aio_write_requests = 0; ++ ++/* Number of pages read/written done for all threads */ ++ulint os_aio_pages_read = 0; ++ulint os_aio_pages_written = 0; ++ ++/* time usec used to perform read/write for all threads */ ++ib_longlong 
os_aio_read_time = 0; ++ib_longlong os_aio_write_time = 0; ++ ++ulint inno_pending_normal_aio_reads = 0; ++ulint inno_pending_normal_aio_writes = 0; ++ulint inno_pending_ibuf_aio_reads = 0; ++ulint inno_pending_log_ios = 0; ++ulint inno_pending_sync_ios = 0; + + /* structure to pass status variables to MySQL */ + export_struc export_vars; +@@ -413,6 +441,23 @@ + + ulint srv_main_thread_process_no = 0; + ulint srv_main_thread_id = 0; ++ ++// The following count work done by srv_master_thread. ++ ++// Iterations by the 'once per second' loop. ++ulint srv_main_1_second_loops = 0; ++// Calls to sleep by the 'once per second' loop. ++ulint srv_main_sleeps = 0; ++// Iterations by the 'once per 10 seconds' loop. ++ulint srv_main_10_second_loops = 0; ++// Iterations of the loop bounded by the 'background_loop' label. ++ulint srv_main_background_loops = 0; ++// Iterations of the loop bounded by the 'flush_loop' label. ++ulint srv_main_flush_loops = 0; ++// Calls to log_buffer_flush_to_disk. ++ulint srv_sync_flush = 0; ++// Calls to log_buffer_flush_maybe_sync. ++ulint srv_async_flush = 0; + + /* + IMPLEMENTATION OF THE SERVER MAIN PROGRAM +@@ -2170,7 +2215,12 @@ + } + + /************************************************************************* +-The master thread controlling the server. */ ++Returns the number of IO operations that is X percent of the capacity. ++ ++PCT_IO(5) -> returns the number of IO operations that is 5% of the max ++where max is srv_io_capacity. 
++*/ ++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0))) + + #ifndef __WIN__ + void* +@@ -2199,11 +2249,15 @@ + ulint n_pend_ios; + ibool skip_sleep = FALSE; + ulint i; ++ + + #ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Master thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); + #endif ++ fprintf(stderr, "InnoDB master thread running with io_capacity %lu\n", ++ srv_io_capacity); ++ + srv_main_thread_process_no = os_proc_get_number(); + srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); + +@@ -2275,26 +2329,28 @@ + + srv_main_thread_op_info = "flushing log"; + log_buffer_flush_to_disk(); ++ srv_sync_flush++; + + srv_main_thread_op_info = "making checkpoint"; + log_free_check(); + +- /* If there were less than 5 i/os during the +- one second sleep, we assume that there is free +- disk i/o capacity available, and it makes sense to +- do an insert buffer merge. */ ++ /* If i/os during one second sleep were less than 5% of ++ capacity, we assume that there is free disk i/o capacity ++ available, and it makes sense to do an insert buffer merge. 
*/ + + n_pend_ios = buf_get_n_pending_ios() + + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) { ++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) { + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5)); + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + } + + if (buf_get_modified_ratio_pct() > +@@ -2303,7 +2359,8 @@ + /* Try to keep the number of modified pages in the + buffer pool under the limit wished by the user */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, ++ PCT_IO(100), + ut_dulint_max); + + /* If we had to do the flush, it may have taken +@@ -2325,36 +2382,47 @@ + + /* ---- We perform the following code approximately once per + 10 seconds when there is database activity */ ++ srv_main_10_second_loops++; + + #ifdef MEM_PERIODIC_CHECK + /* Check magic numbers of every allocated mem block once in 10 + seconds */ + mem_validate_all_blocks(); + #endif +- /* If there were less than 200 i/os during the 10 second period, +- we assume that there is free disk i/o capacity available, and it +- makes sense to flush 100 pages. */ ++ /* If i/os during the 10 second period were less than 200% of ++ capacity, we assume that there is free disk i/o capacity ++ available, and it makes sense to flush srv_io_capacity pages. ++ ++ Note that this is done regardless of the fraction of dirty ++ pages relative to the max requested by the user. The one second ++ loop above requests writes for that case. The writes done here ++ are not required, and may be disabled. 
*/ + + n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) { ++ if (srv_extra_dirty_writes && ++ n_pend_ios < PCT_IO(3) && (n_ios - n_ios_very_old < PCT_IO(200))) { + + srv_main_thread_op_info = "flushing buffer pool pages"; +- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + } + + /* We run a batch of insert buffer merge every 10 seconds, + even if the server were active */ + + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5)); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + + /* We run a full purge every 10 seconds, even if the server + were active */ +@@ -2378,8 +2446,9 @@ + if (difftime(current_time, last_flush_time) > 1) { + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ log_buffer_flush_to_disk(); + last_flush_time = current_time; ++ srv_sync_flush++; + } + } + +@@ -2393,14 +2462,14 @@ + (> 70 %), we assume we can afford reserving the disk(s) for + the time it requires to flush 100 pages */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + ut_dulint_max); + } else { + /* Otherwise, we only flush a small number of pages so that + we do not unnecessarily use much disk i/o capacity from + other work */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 
PCT_IO(10), + ut_dulint_max); + } + +@@ -2434,7 +2503,7 @@ + + /* The server has been quiet for a while: start running background + operations */ +- ++ srv_main_background_loops++; + srv_main_thread_op_info = "doing background drop tables"; + + n_tables_to_drop = row_drop_tables_for_mysql_in_background(); +@@ -2472,6 +2541,7 @@ + + log_buffer_flush_to_disk(); + last_flush_time = current_time; ++ srv_sync_flush++; + } + } + +@@ -2487,9 +2557,13 @@ + srv_main_thread_op_info = "doing insert buffer merge"; + + if (srv_fast_shutdown && srv_shutdown_state > 0) { +- n_bytes_merged = 0; ++ n_bytes_merged = 0; + } else { +- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20); ++ /* This should do an amount of IO similar to the number of ++ * dirty pages that will be flushed in the call to ++ * buf_flush_batch below. Otherwise, the system favors ++ * clean pages over cleanup throughput. */ ++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100)); + } + + srv_main_thread_op_info = "reserving kernel mutex"; +@@ -2503,10 +2577,11 @@ + + flush_loop: + srv_main_thread_op_info = "flushing buffer pool pages"; ++ srv_main_flush_loops++; + + if (srv_fast_shutdown < 2) { + n_pages_flushed = +- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + } else { + /* In the fastest shutdown we do not flush the buffer pool + to data files: we set n_pages_flushed to 0 artificially. 
*/ +@@ -2528,7 +2603,17 @@ + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ current_time = time(NULL); ++ if (difftime(current_time, last_flush_time) > 1) { ++ srv_main_thread_op_info = (char*) "flushing log"; ++ log_buffer_flush_to_disk(); ++ last_flush_time = current_time; ++ srv_sync_flush++; ++ } else { ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; ++ } + + srv_main_thread_op_info = "making checkpoint"; + +diff -r 322370200e6a innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/srv/srv0start.c Mon Nov 03 05:08:52 2008 -0800 +@@ -973,6 +973,7 @@ + ulint i; + ibool srv_file_per_table_original_value = srv_file_per_table; + mtr_t mtr; ++ ulint n_threads; + #ifdef HAVE_DARWIN_THREADS + # ifdef F_FULLFSYNC + /* This executable has been compiled on Mac OS X 10.3 or later. +@@ -1206,24 +1207,32 @@ + } + + /* Restrict the maximum number of file i/o threads */ +- if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) { +- +- srv_n_file_io_threads = SRV_MAX_N_IO_THREADS; ++ if ((srv_n_read_io_threads + srv_n_write_io_threads) > SRV_MAX_N_IO_THREADS) { ++ fprintf(stderr, ++ "InnoDB: requested too many read(%d) or write(%d) IO threads, max is %d\n", ++ srv_n_read_io_threads, srv_n_write_io_threads, SRV_MAX_N_IO_THREADS); ++ return(DB_ERROR); + } + + if (!os_aio_use_native_aio) { +- /* In simulated aio we currently have use only for 4 threads */ +- srv_n_file_io_threads = 4; ++ /* More than 4 threads are now supported. */ ++ n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD, ++ srv_n_read_io_threads, ++ srv_n_write_io_threads, ++ SRV_MAX_N_PENDING_SYNC_IOS); ++ } else { ++ /* Might need more slots here. Alas, I don't do windows. 
*/ ++ n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD, ++ srv_n_read_io_threads, ++ srv_n_write_io_threads, ++ SRV_MAX_N_PENDING_SYNC_IOS); ++ } + +- os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD +- * srv_n_file_io_threads, +- srv_n_file_io_threads, +- SRV_MAX_N_PENDING_SYNC_IOS); +- } else { +- os_aio_init(SRV_N_PENDING_IOS_PER_THREAD +- * srv_n_file_io_threads, +- srv_n_file_io_threads, +- SRV_MAX_N_PENDING_SYNC_IOS); ++ if (n_threads > SRV_MAX_N_IO_THREADS) { ++ fprintf(stderr, ++ "InnoDB: requested too many IO threads(%d), max is %d\n", ++ n_threads, SRV_MAX_N_IO_THREADS); ++ return(DB_ERROR); + } + + fil_init(srv_max_n_open_files); +@@ -1259,11 +1268,11 @@ + + /* Create i/o-handler threads: */ + +- for (i = 0; i < srv_n_file_io_threads; i++) { ++ for (i = 0; i < n_threads; i++) { + n[i] = i; + + os_thread_create(io_handler_thread, n + i, thread_ids + i); +- } ++ } + + #ifdef UNIV_LOG_ARCHIVE + if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) { +diff -r 322370200e6a patch_info/innodb_io_tune.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_io_tune.info Mon Nov 03 05:08:52 2008 -0800 +@@ -0,0 +1,9 @@ ++File=innodb_io_tune.patch ++Name=Tune InnoDB IO settings ++Version=1.0 ++Author=Google ++License=GPL ++Comment= ++ChangeLog= ++2008-11-01 ++VT: Initial porting +diff -r 322370200e6a sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -147,7 +147,7 @@ + innobase_additional_mem_pool_size, innobase_file_io_threads, + innobase_lock_wait_timeout, innobase_force_recovery, + innobase_open_files; +- ++long innobase_read_io_threads, innobase_write_io_threads; + longlong innobase_buffer_pool_size, innobase_log_file_size; + + /* The default values for the following char* start-up parameters +@@ -175,6 +175,23 @@ + my_bool innobase_rollback_on_timeout = FALSE; + my_bool innobase_create_status_file = FALSE; + my_bool innobase_adaptive_hash_index = TRUE; ++ 
++/* Max number of IO requests merged to perform large IO in background ++ IO threads. ++*/ ++long innobase_max_merged_io = 64; ++ ++/* time interval in seconds allowed to calling innodb_show_status functions */ ++long innobase_min_status_update_time_interval = 30; ++ ++ ++/* Default number of IO per second supported by server. Tunes background ++ IO rate ++*/ ++long innobase_io_capacity = 100; ++ ++/* Write dirty pages when pct dirty is less than max pct dirty */ ++my_bool innobase_extra_dirty_writes = TRUE; + + static char *internal_innobase_data_file_path = NULL; + +@@ -1372,7 +1389,11 @@ + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + ++ srv_io_capacity = (ulint) innobase_io_capacity; ++ srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes; + srv_n_file_io_threads = (ulint) innobase_file_io_threads; ++ srv_n_read_io_threads = (ulint) innobase_read_io_threads; ++ srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout; + srv_force_recovery = (ulint) innobase_force_recovery; +diff -r 322370200e6a sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800 +@@ -197,6 +197,7 @@ + + extern struct show_var_st innodb_status_variables[]; + extern ulong innobase_fast_shutdown; ++extern long innobase_max_merged_io; + extern ulong innobase_large_page_size; + extern long innobase_mirrored_log_groups, innobase_log_files_in_group; + extern longlong innobase_buffer_pool_size, innobase_log_file_size; +@@ -205,10 +206,14 @@ + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; + extern long innobase_force_recovery; ++extern long innobase_read_io_threads, innobase_write_io_threads; + extern long innobase_open_files; + extern char *innobase_data_home_dir, *innobase_data_file_path; + extern char *innobase_log_group_home_dir, *innobase_log_arch_dir; + extern char 
*innobase_unix_file_flush_method; ++extern long innobase_io_capacity; ++extern my_bool innobase_extra_dirty_writes; ++ + /* The following variables have to be my_bool for SHOW VARIABLES to work */ + extern my_bool innobase_log_archive, + innobase_use_doublewrite, +diff -r 322370200e6a sql/mysqld.cc +--- a/sql/mysqld.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -4932,6 +4932,11 @@ + OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE, + OPT_INNODB_MAX_PURGE_LAG, + OPT_INNODB_FILE_IO_THREADS, ++ OPT_INNODB_READ_IO_THREADS, ++ OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_MAX_MERGED_IO, ++ OPT_INNODB_IO_CAPACITY, ++ OPT_INNODB_EXTRA_DIRTY_WRITES, + OPT_INNODB_LOCK_WAIT_TIMEOUT, + OPT_INNODB_THREAD_CONCURRENCY, + OPT_INNODB_COMMIT_CONCURRENCY, +@@ -5302,6 +5307,25 @@ + (gptr*) &global_system_variables.innodb_table_locks, + (gptr*) &global_system_variables.innodb_table_locks, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, ++ {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO, ++ "Max number of IO requests merged to issue large IO from background IO threads.", ++ (gptr*) &innobase_max_merged_io, ++ (gptr*) &innobase_max_merged_io, 0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0}, ++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS, ++ "Number of background read I/O threads in InnoDB.", (gptr*) &innobase_read_io_threads, ++ (gptr*) &innobase_read_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0}, ++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS, ++ "Number of background write I/O threads in InnoDB.", (gptr*) &innobase_write_io_threads, ++ (gptr*) &innobase_write_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0}, ++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY, ++ "Number of IO operations per second the server can do. 
Tunes background IO rate.", ++ (gptr*) &innobase_io_capacity, ++ (gptr*) &innobase_io_capacity, 0, GET_LONG, ++ REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0}, ++ {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES, ++ "When set, flush dirty buffer pages when dirty pct is less than max dirty pct. ", ++ (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes, ++ 0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0}, + #endif /* End HAVE_INNOBASE_DB */ + {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.", + (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0, +diff -r 322370200e6a sql/set_var.cc +--- a/sql/set_var.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -919,12 +919,14 @@ + {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR}, + {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL}, + {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL}, ++ {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL}, + {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS}, + {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG }, + {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL}, + {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS}, + {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR}, + {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG }, ++ {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG }, + {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG }, + {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL}, + {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR}, +@@ -943,6 +945,9 @@ + {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, 
SHOW_SYS}, + {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS}, + {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS}, ++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG }, ++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG }, ++ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG}, + #endif + {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS}, + {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS}, diff --git a/percona/5.0.91-b22-20100522/innodb_locks_held.patch b/percona/5.0.91-b22-20100522/innodb_locks_held.patch new file mode 100644 index 0000000..062fa47 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_locks_held.patch @@ -0,0 +1,219 @@ +diff -r e9fb5b8bcf78 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Mon Jun 01 00:36:33 2009 -0700 ++++ b/innobase/include/srv0srv.h Mon Jun 01 00:36:41 2009 -0700 +@@ -80,6 +80,8 @@ + extern ulint srv_log_file_size; + extern ulint srv_log_buffer_size; + extern ulong srv_flush_log_at_trx_commit; ++extern ulong srv_show_locks_held; ++extern ulong srv_show_verbose_locks; + + extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 + character set */ +diff -r e9fb5b8bcf78 innobase/lock/lock0lock.c +--- a/innobase/lock/lock0lock.c Mon Jun 01 00:36:33 2009 -0700 ++++ b/innobase/lock/lock0lock.c Mon Jun 01 00:36:41 2009 -0700 +@@ -4181,6 +4181,7 @@ + #endif /* UNIV_SYNC_DEBUG */ + } + ++ if ( srv_show_verbose_locks ) { + for (i = 0; i < lock_rec_get_n_bits(lock); i++) { + + if (lock_rec_get_nth_bit(lock, i)) { +@@ -4198,6 +4199,7 @@ + putc('\n', file); + } + } ++ } /* srv_show_verbose_locks */ + + mtr_commit(&mtr); + if (UNIV_LIKELY_NULL(heap)) { +@@ -4369,7 +4371,7 @@ + } + } + +- if (!srv_print_innodb_lock_monitor) { ++ if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) { + nth_trx++; + goto loop; + } +@@ 
-4426,9 +4428,9 @@ + + nth_lock++; + +- if (nth_lock >= 10) { ++ if (nth_lock >= srv_show_locks_held) { + fputs( +- "10 LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n", ++ "TOO MANY LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n", + file); + + nth_trx++; +diff -r e9fb5b8bcf78 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Mon Jun 01 00:36:33 2009 -0700 ++++ b/innobase/srv/srv0srv.c Mon Jun 01 00:36:41 2009 -0700 +@@ -116,6 +116,8 @@ + ulint srv_log_file_size = ULINT_MAX; /* size in database pages */ + ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */ + ulong srv_flush_log_at_trx_commit = 1; ++ulint srv_show_locks_held = 10; ++ulint srv_show_verbose_locks = 0; + + byte srv_latin1_ordering[256] /* The sort order table of the latin1 + character set. The following table is +@@ -1711,24 +1713,6 @@ + + mutex_exit(&dict_foreign_err_mutex); + +- lock_print_info_summary(file); +- if (trx_start) { +- long t = ftell(file); +- if (t < 0) { +- *trx_start = ULINT_UNDEFINED; +- } else { +- *trx_start = (ulint) t; +- } +- } +- lock_print_info_all_transactions(file); +- if (trx_end) { +- long t = ftell(file); +- if (t < 0) { +- *trx_end = ULINT_UNDEFINED; +- } else { +- *trx_end = (ulint) t; +- } +- } + fputs("--------\n" + "FILE I/O\n" + "--------\n", file); +@@ -1822,6 +1806,25 @@ + srv_n_rows_deleted_old = srv_n_rows_deleted; + srv_n_rows_read_old = srv_n_rows_read; + ++ lock_print_info_summary(file); ++ if (trx_start) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_start = ULINT_UNDEFINED; ++ } else { ++ *trx_start = (ulint) t; ++ } ++ } ++ lock_print_info_all_transactions(file); ++ if (trx_end) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_end = ULINT_UNDEFINED; ++ } else { ++ *trx_end = (ulint) t; ++ } ++ } ++ + fputs("----------------------------\n" + "END OF INNODB MONITOR OUTPUT\n" + "============================\n", file); +diff -r e9fb5b8bcf78 libmysqld/set_var.cc +--- a/libmysqld/set_var.cc Mon Jun 01 00:36:33 2009 
-0700 ++++ b/libmysqld/set_var.cc Mon Jun 01 00:36:41 2009 -0700 +@@ -825,6 +825,8 @@ + &sys_innodb_thread_concurrency, + &sys_innodb_commit_concurrency, + &sys_innodb_flush_log_at_trx_commit, ++ &sys_innodb_show_locks_held, ++ &sys_innodb_show_verbose_locks, + #endif + &sys_trust_routine_creators, + &sys_trust_function_creators, +@@ -942,6 +944,8 @@ + {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG }, + {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL}, + {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS}, ++ {sys_innodb_show_locks_held.name, (char*) &sys_innodb_show_locks_held, SHOW_SYS }, ++ {sys_innodb_show_verbose_locks.name, (char*) &sys_innodb_show_verbose_locks, SHOW_SYS }, + {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR}, + {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG }, + {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG }, +diff -r e9fb5b8bcf78 patch_info/innodb_locks_held.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_locks_held.info Mon Jun 01 00:36:41 2009 -0700 +@@ -0,0 +1,6 @@ ++File=innodb_locks_held.patch ++Name=Add locks held, remove locked records in SHOW INNODB STATUS ++Version=1.0 ++Author=Baron Schwartz <baron@xaprb.com> ++License=GPL ++Comment=Bug #29126 fix +diff -r e9fb5b8bcf78 sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Jun 01 00:36:33 2009 -0700 ++++ b/sql/ha_innodb.h Mon Jun 01 00:36:41 2009 -0700 +@@ -243,6 +243,8 @@ + extern ulong srv_enable_unsafe_group_commit; + extern uint srv_read_ahead; + extern uint srv_adaptive_checkpoint; ++extern ulong srv_show_locks_held; ++extern ulong srv_show_verbose_locks; + + /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does + NOT update cardinality for indexes of InnoDB table". 
By default we are +diff -r e9fb5b8bcf78 sql/mysqld.cc +--- a/sql/mysqld.cc Mon Jun 01 00:36:33 2009 -0700 ++++ b/sql/mysqld.cc Mon Jun 01 00:36:41 2009 -0700 +@@ -5016,6 +5016,8 @@ + OPT_INNODB_MAX_PURGE_LAG, + OPT_INNODB_FILE_IO_THREADS, + OPT_INNODB_LOCK_WAIT_TIMEOUT, ++ OPT_INNODB_SHOW_LOCKS_HELD, ++ OPT_INNODB_SHOW_VERBOSE_LOCKS, + OPT_INNODB_THREAD_CONCURRENCY, + OPT_INNODB_COMMIT_CONCURRENCY, + OPT_INNODB_FORCE_RECOVERY, +@@ -5364,6 +5366,14 @@ + (gptr*) &srv_flush_log_at_trx_commit, + (gptr*) &srv_flush_log_at_trx_commit, + 0, GET_ULONG, OPT_ARG, 1, 0, 2, 0, 0, 0}, ++ {"innodb_show_locks_held", OPT_INNODB_SHOW_LOCKS_HELD, ++ "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.", ++ (gptr*) &srv_show_locks_held, (gptr*) &srv_show_locks_held, ++ 0, GET_LONG, OPT_ARG, 10, 0, 1000, 0, 1, 0}, ++ {"innodb_show_verbose_locks", OPT_INNODB_SHOW_VERBOSE_LOCKS, ++ "Whether to show records locked in SHOW INNODB STATUS.", ++ (gptr*) &srv_show_verbose_locks, (gptr*) &srv_show_verbose_locks, ++ 0, GET_LONG, OPT_ARG, 0, 0, 1, 0, 1, 0}, + {"innodb_flush_method", OPT_INNODB_FLUSH_METHOD, + "With which method to flush data.", (gptr*) &innobase_unix_file_flush_method, + (gptr*) &innobase_unix_file_flush_method, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, +diff -r e9fb5b8bcf78 sql/set_var.cc +--- a/sql/set_var.cc Mon Jun 01 00:36:33 2009 -0700 ++++ b/sql/set_var.cc Mon Jun 01 00:36:41 2009 -0700 +@@ -527,6 +527,12 @@ + sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint", + &srv_adaptive_checkpoint, + &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint); ++sys_var_long_ptr sys_innodb_show_locks_held( ++ "innodb_show_locks_held", ++ &srv_show_locks_held); ++sys_var_long_ptr sys_innodb_show_verbose_locks( ++ "innodb_show_verbose_locks", ++ &srv_show_verbose_locks); + sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path", + &innobase_data_file_path); + sys_var_const_os_str_ptr 
sys_innodb_data_home_dir("innodb_data_home_dir", +@@ -906,6 +912,8 @@ + &sys_innodb_read_ahead, + &sys_innodb_enable_unsafe_group_commit, + &sys_innodb_adaptive_checkpoint, ++ &sys_innodb_show_locks_held, ++ &sys_innodb_show_verbose_locks, + #endif + &sys_trust_routine_creators, + &sys_trust_function_creators, +@@ -1023,6 +1031,8 @@ + {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG }, + {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL}, + {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS}, ++ {sys_innodb_show_locks_held.name, (char*) &sys_innodb_show_locks_held, SHOW_SYS }, ++ {sys_innodb_show_verbose_locks.name, (char*) &sys_innodb_show_verbose_locks, SHOW_SYS }, + {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR}, + {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG }, + {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG }, diff --git a/percona/5.0.91-b22-20100522/innodb_misc_patch.patch b/percona/5.0.91-b22-20100522/innodb_misc_patch.patch new file mode 100644 index 0000000..4f4faf3 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_misc_patch.patch @@ -0,0 +1,64 @@ +diff -ru mysql-5.0.84_p_orig/innobase/row/row0sel.c mysql-5.0.84/innobase/row/row0sel.c +--- mysql-5.0.84_p_orig/innobase/row/row0sel.c 2009-07-07 21:54:10.000000000 +0900 ++++ mysql-5.0.84/innobase/row/row0sel.c 2009-08-28 09:28:56.000000000 +0900 +@@ -2988,6 +2988,15 @@ + return(SEL_FOUND); + } + ++/********************************************************************** ++Returns true if the thread is executing a SELECT statement. 
++(Prototype for global functions in ha_innodb.cc) */ ++ibool ++thd_is_select( ++/*==========*/ ++ /* out: true if thd is executing SELECT */ ++ const void* thd); /* in: thread handle (THD*) */ ++ + /************************************************************************ + Searches for rows in the database. This is used in the interface to + MySQL. This function opens a cursor, and also implements fetch next +@@ -3361,20 +3370,12 @@ + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE +- && trx->mysql_query_str) { +- +- /* Scan the MySQL query string; check if SELECT is the first +- word there */ +- ibool success; +- +- dict_accept(*trx->mysql_query_str, "SELECT", &success); +- +- if (success) { ++ && trx->mysql_thd != NULL ++ && thd_is_select(trx->mysql_thd)) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; +- } + } + + /* Note that if the search mode was GE or G, then the cursor +diff -ru mysql-5.0.84_p_orig/sql/ha_innodb.cc mysql-5.0.84/sql/ha_innodb.cc +--- mysql-5.0.84_p_orig/sql/ha_innodb.cc 2009-08-27 16:06:21.000000000 +0900 ++++ mysql-5.0.84/sql/ha_innodb.cc 2009-08-28 09:33:38.000000000 +0900 +@@ -394,6 +394,18 @@ + } + } + ++/********************************************************************** ++Returns true if the thread is executing a SELECT statement. */ ++extern "C" ++ibool ++thd_is_select( ++/*==========*/ ++ /* out: true if thd is executing SELECT */ ++ const void* thd) /* in: thread handle (THD*) */ ++{ ++ return(((const THD*) thd)->lex->sql_command == SQLCOM_SELECT); ++} ++ + /************************************************************************ + Call this function when mysqld passes control to the client. That is to + avoid deadlocks on the adaptive hash S-latch possibly held by thd. 
For more diff --git a/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch b/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch new file mode 100644 index 0000000..3d3e567 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch @@ -0,0 +1,217 @@ +diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c +--- a/innobase/buf/buf0flu.c 2009-08-04 16:53:42.000000000 +0900 ++++ b/innobase/buf/buf0flu.c 2009-08-04 17:02:36.000000000 +0900 +@@ -85,6 +85,22 @@ + prev_b = NULL; + b = UT_LIST_GET_FIRST(buf_pool->flush_list); + ++ if (srv_fast_recovery) { ++ /* speed hack */ ++ if (b == NULL || (ut_dulint_cmp(b->oldest_modification, ++ block->oldest_modification) < 0)) { ++ UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block); ++ } else { ++ b = UT_LIST_GET_LAST(buf_pool->flush_list); ++ if (ut_dulint_cmp(b->oldest_modification, ++ block->oldest_modification) < 0) { ++ /* align oldest_modification not to sort */ ++ block->oldest_modification = b->oldest_modification; ++ } ++ UT_LIST_ADD_LAST(flush_list, buf_pool->flush_list, block); ++ } ++ } else { ++ /* normal */ + while (b && (ut_dulint_cmp(b->oldest_modification, + block->oldest_modification) > 0)) { + prev_b = b; +@@ -97,6 +113,7 @@ + UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, + block); + } ++ } + + ut_ad(buf_flush_validate_low()); + } +diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c +--- a/innobase/buf/buf0rea.c 2009-08-04 16:53:42.000000000 +0900 ++++ b/innobase/buf/buf0rea.c 2009-08-04 17:11:41.000000000 +0900 +@@ -127,6 +127,46 @@ + block = buf_page_init_for_read(err, mode, space, tablespace_version, + offset); + if (block == NULL) { ++ /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */ ++ if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) { ++ /* hashed log recs must be treated here */ ++ recv_addr_t* recv_addr; ++ ++ mutex_enter(&(recv_sys->mutex)); ++ ++ if (recv_sys->apply_log_recs == FALSE) { ++ mutex_exit(&(recv_sys->mutex)); 
++ goto not_to_recover; ++ } ++ ++ /* recv_get_fil_addr_struct() */ ++ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, ++ hash_calc_hash(ut_fold_ulint_pair(space, offset), ++ recv_sys->addr_hash)); ++ while (recv_addr) { ++ if ((recv_addr->space == space) ++ && (recv_addr->page_no == offset)) { ++ break; ++ } ++ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); ++ } ++ ++ if ((recv_addr == NULL) ++ || (recv_addr->state == RECV_BEING_PROCESSED) ++ || (recv_addr->state == RECV_PROCESSED)) { ++ mutex_exit(&(recv_sys->mutex)); ++ goto not_to_recover; ++ } ++ ++ fprintf(stderr, " (space:%lu is deleted)", space); ++ recv_addr->state = RECV_PROCESSED; ++ ++ ut_a(recv_sys->n_addrs); ++ recv_sys->n_addrs--; ++ ++ mutex_exit(&(recv_sys->mutex)); ++ } ++not_to_recover: + + return(0); + } +@@ -697,11 +737,11 @@ + while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { + + os_aio_simulated_wake_handler_threads(); +- os_thread_sleep(500000); ++ os_thread_sleep(10000); + + count++; + +- if (count > 100) { ++ if (count > 5000) { + fprintf(stderr, + "InnoDB: Error: InnoDB has waited for 50 seconds for pending\n" + "InnoDB: reads to the buffer pool to be finished.\n" +diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h 2009-08-04 16:53:42.000000000 +0900 ++++ b/innobase/include/srv0srv.h 2009-08-04 17:39:51.000000000 +0900 +@@ -59,6 +59,8 @@ + extern ibool srv_file_per_table; + extern ibool srv_locks_unsafe_for_binlog; + ++extern ibool srv_fast_recovery; ++ + extern ulint srv_n_data_files; + extern char** srv_data_file_names; + extern ulint* srv_data_file_sizes; +diff -ruN a/innobase/log/log0recv.c b/innobase/log/log0recv.c +--- a/innobase/log/log0recv.c 2009-07-07 21:54:08.000000000 +0900 ++++ b/innobase/log/log0recv.c 2009-08-04 17:15:15.000000000 +0900 +@@ -101,7 +101,7 @@ + use these free frames to read in pages when we start applying the + log records to the database. 
*/ + +-ulint recv_n_pool_free_frames = 256; ++ulint recv_n_pool_free_frames = 1024; + + /* The maximum lsn we see for a page during the recovery process. If this + is bigger than the lsn we are able to scan up to, that is an indication that +@@ -1135,6 +1135,8 @@ + recv_addr = recv_get_fil_addr_struct(space, page_no); + + if ((recv_addr == NULL) ++ /* bugfix: http://bugs.mysql.com/bug.php?id=44140 */ ++ || (recv_addr->state == RECV_BEING_READ && !just_read_in) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-08-04 16:53:42.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-08-04 17:41:05.000000000 +0900 +@@ -88,6 +88,8 @@ + i.e. do not use next-key locking + except on duplicate key checking and + foreign key checking */ ++ibool srv_fast_recovery = FALSE; ++ + ulint srv_n_data_files = 0; + char** srv_data_file_names = NULL; + ulint* srv_data_file_sizes = NULL; /* size in database pages */ +diff -ruN a/patch_info/innodb_recovery_patches.info b/patch_info/innodb_recovery_patches.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_recovery_patches.info 2009-08-04 16:58:07.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_recovery_patches.patch ++Name=Bugfixes and adjustments about recovery process ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-08-04 16:53:42.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-08-04 17:35:44.000000000 +0900 +@@ -182,6 +182,7 @@ + my_bool innobase_rollback_on_timeout = FALSE; + my_bool innobase_create_status_file = FALSE; + my_bool innobase_adaptive_hash_index = TRUE; ++my_bool innobase_fast_recovery = FALSE; + + static char *internal_innobase_data_file_path = NULL; + +@@ -1534,6 +1535,8 @@ + srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout; + srv_force_recovery = 
(ulint) innobase_force_recovery; + ++ srv_fast_recovery = (ibool) innobase_fast_recovery; ++ + srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + srv_use_checksums = (ibool) innobase_use_checksums; + +diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h +--- a/sql/ha_innodb.h 2009-08-04 16:53:42.000000000 +0900 ++++ b/sql/ha_innodb.h 2009-08-04 17:37:18.000000000 +0900 +@@ -220,6 +220,7 @@ + innobase_use_large_pages, + innobase_use_native_aio, + innobase_file_per_table, innobase_locks_unsafe_for_binlog, ++ innobase_fast_recovery, + innobase_rollback_on_timeout, + innobase_create_status_file, + innobase_adaptive_hash_index; +diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc +--- a/sql/mysqld.cc 2009-08-04 16:53:42.000000000 +0900 ++++ b/sql/mysqld.cc 2009-08-04 17:48:25.000000000 +0900 +@@ -5102,6 +5102,7 @@ + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, + OPT_INNODB_USE_SYS_MALLOC, ++ OPT_INNODB_FAST_RECOVERY, + OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED, + OPT_INNODB_EXTRA_RSEGMENTS, + OPT_INNODB_DICT_SIZE_LIMIT, +@@ -5347,6 +5348,10 @@ + {"innodb_doublewrite", OPT_INNODB_DOUBLEWRITE, "Enable InnoDB doublewrite buffer (enabled by default). \ + Disable with --skip-innodb-doublewrite.", (gptr*) &innobase_use_doublewrite, + (gptr*) &innobase_use_doublewrite, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, ++ {"innodb_fast_recovery", OPT_INNODB_FAST_RECOVERY, ++ "Enable to use speed hack of recovery avoiding flush list sorting.", ++ (gptr*) &innobase_fast_recovery, (gptr*) &innobase_fast_recovery, ++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"innodb_fast_shutdown", OPT_INNODB_FAST_SHUTDOWN, + "Speeds up the shutdown process of the InnoDB storage engine. 
Possible " + "values are 0, 1 (faster)" +diff -ruN a/sql/set_var.cc b/sql/set_var.cc +--- a/sql/set_var.cc 2009-08-04 16:53:42.000000000 +0900 ++++ b/sql/set_var.cc 2009-08-04 17:51:49.000000000 +0900 +@@ -1088,6 +1088,7 @@ + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, + {"innodb_use_sys_malloc", (char*) &innobase_use_sys_malloc, SHOW_MY_BOOL}, ++ {"innodb_fast_recovery", (char*) &innobase_fast_recovery, SHOW_MY_BOOL}, + {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL}, + {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG}, + {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS}, diff --git a/percona/5.0.91-b22-20100522/innodb_rw_lock.patch b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch new file mode 100644 index 0000000..a509f70 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch @@ -0,0 +1,2480 @@ +diff -ruN a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c +--- a/innobase/btr/btr0cur.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/btr/btr0cur.c 2009-10-22 15:18:44.000000000 +0900 +@@ -313,7 +313,7 @@ + #ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; + #endif +- if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED ++ if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED + && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ + && !estimate + #ifdef PAGE_CUR_LE_OR_EXTENDS +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-10-22 15:18:44.000000000 +0900 +@@ -773,8 +773,8 @@ + rw_lock_s_lock(&btr_search_latch); + } + +- ut_ad(btr_search_latch.writer != RW_LOCK_EX); +- ut_ad(btr_search_latch.reader_count > 0); ++ ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX); ++ 
ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); + +diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/buf/buf0buf.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1292,7 +1292,7 @@ + + if (mode == BUF_GET_NOWAIT) { + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { +@@ -1442,7 +1442,7 @@ + ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset)); + + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { +@@ -1596,7 +1596,7 @@ + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); + + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { +diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic +--- a/innobase/include/buf0buf.ic 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/include/buf0buf.ic 2009-10-22 16:12:25.000000000 +0900 +@@ -523,7 +523,7 @@ + #ifdef UNIV_SYNC_DEBUG + ibool ret; + +- ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ++ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line); + + ut_ad(ret == TRUE); + ut_ad(mutex_own(&block->mutex)); +diff -ruN a/innobase/include/os0sync.h b/innobase/include/os0sync.h +--- a/innobase/include/os0sync.h 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/os0sync.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,11 +1,35 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. 
++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The interface to the operating system + synchronization primitives. + +-(c) 1995 Innobase Oy +- + Created 9/6/1995 Heikki Tuuri + *******************************************************/ ++ + #ifndef os0sync_h + #define os0sync_h + +@@ -261,6 +285,23 @@ + /*===============*/ + os_fast_mutex_t* fast_mutex); /* in: mutex to free */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++/************************************************************** ++Atomic compare-and-swap for InnoDB. Currently requires GCC atomic builtins. ++Returns true if swapped, ptr is pointer to target, old_val is value to ++compare to, new_val is the value to swap in. 
*/ ++#define os_compare_and_swap(ptr, old_val, new_val) \ ++ __sync_bool_compare_and_swap(ptr, old_val, new_val) ++ ++/************************************************************** ++Atomic increment for InnoDB. Currently requires GCC atomic builtins. ++Returns the resulting value, ptr is pointer to target, amount is the ++amount of increment. */ ++#define os_atomic_increment(ptr, amount) \ ++ __sync_add_and_fetch(ptr, amount) ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ + #ifndef UNIV_NONINL + #include "os0sync.ic" + #endif +diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h +--- a/innobase/include/sync0rw.h 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0rw.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for threads, not for database transactions) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -24,6 +47,12 @@ + #define RW_X_LATCH 2 + #define RW_NO_LATCH 3 + ++/* We decrement lock_word by this amount for each x_lock. It is also the ++start value for the lock_word, meaning that it limits the maximum number ++of concurrent read locks before the rw_lock breaks. The current value of ++0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/ ++#define X_LOCK_DECR 0x00100000 ++ + typedef struct rw_lock_struct rw_lock_t; + #ifdef UNIV_SYNC_DEBUG + typedef struct rw_lock_debug_struct rw_lock_debug_t; +@@ -47,14 +76,14 @@ + there may be waiters for the event */ + #endif /* UNIV_SYNC_DEBUG */ + +-extern ulint rw_s_system_call_count; +-extern ulint rw_s_spin_wait_count; +-extern ulint rw_s_exit_count; +-extern ulint rw_s_os_wait_count; +-extern ulint rw_x_system_call_count; +-extern ulint rw_x_spin_wait_count; +-extern ulint rw_x_os_wait_count; +-extern ulint rw_x_exit_count; ++extern ib_longlong rw_s_spin_wait_count; ++extern ib_longlong rw_s_spin_round_count; ++extern ib_longlong rw_s_exit_count; ++extern ib_longlong rw_s_os_wait_count; ++extern ib_longlong rw_x_spin_wait_count; ++extern ib_longlong rw_x_spin_round_count; ++extern ib_longlong rw_x_os_wait_count; ++extern ib_longlong rw_x_exit_count; + + /********************************************************************** + Creates, or rather, initializes an rw-lock object in a specified memory +@@ -116,8 +145,22 @@ + NOTE! 
The following macros should be used in rw s-locking, not the + corresponding function. */ + +-#define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(\ +- (M), __FILE__, __LINE__) ++#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\ ++ (M), 0, (F), (L)) ++/********************************************************************** ++Low-level function which tries to lock an rw-lock in s-mode. Performs no ++spinning. */ ++UNIV_INLINE ++ibool ++rw_lock_s_lock_low( ++/*===============*/ ++ /* out: TRUE if success */ ++ rw_lock_t* lock, /* in: pointer to rw-lock */ ++ ulint pass __attribute__((unused)), ++ /* in: pass value; != 0, if the lock will be ++ passed to another thread to unlock */ ++ const char* file_name, /* in: file name where lock requested */ ++ ulint line); /* in: line where requested */ + /********************************************************************** + NOTE! Use the corresponding macro, not directly this function, except if + you supply the file name and line number. Lock an rw-lock in shared mode +@@ -135,18 +178,6 @@ + const char* file_name,/* in: file name where lock requested */ + ulint line); /* in: line where requested */ + /********************************************************************** +-NOTE! Use the corresponding macro, not directly this function, except if +-you supply the file name and line number. Lock an rw-lock in shared mode +-for the current thread if the lock can be acquired immediately. */ +-UNIV_INLINE +-ibool +-rw_lock_s_lock_func_nowait( +-/*=======================*/ +- /* out: TRUE if success */ +- rw_lock_t* lock, /* in: pointer to rw-lock */ +- const char* file_name,/* in: file name where lock requested */ +- ulint line); /* in: line where requested */ +-/********************************************************************** + NOTE! Use the corresponding macro, not directly this function! Lock an + rw-lock in exclusive mode for the current thread if the lock can be + obtained immediately. 
*/ +@@ -338,6 +369,41 @@ + rw_lock_get_reader_count( + /*=====================*/ + rw_lock_t* lock); ++/********************************************************************** ++Decrements lock_word the specified amount if it is greater than 0. ++This is used by both s_lock and x_lock operations. */ ++UNIV_INLINE ++ibool ++rw_lock_lock_word_decr( ++/*===================*/ ++ /* out: TRUE if decr occurs */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount); /* in: amount to decrement */ ++/********************************************************************** ++Increments lock_word the specified amount and returns new value. */ ++UNIV_INLINE ++lint ++rw_lock_lock_word_incr( ++/*===================*/ ++ /* out: lock->lock_word after increment */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount); /* in: amount of increment */ ++/********************************************************************** ++This function sets the lock->writer_thread and lock->recursive fields. ++For platforms where we are using atomic builtins instead of lock->mutex ++it sets the lock->writer_thread field using atomics to ensure memory ++ordering. Note that it is assumed that the caller of this function ++effectively owns the lock i.e.: nobody else is allowed to modify ++lock->writer_thread at this point in time. ++The protocol is that lock->writer_thread MUST be updated BEFORE the ++lock->recursive flag is set. */ ++UNIV_INLINE ++void ++rw_lock_set_writer_id_and_recursion_flag( ++/*=====================================*/ ++ rw_lock_t* lock, /* in/out: lock to work on */ ++ ibool recursive); /* in: TRUE if recursion ++ allowed */ + #ifdef UNIV_SYNC_DEBUG + /********************************************************************** + Checks if the thread has locked the rw-lock in the specified mode, with +@@ -417,47 +483,33 @@ + field. Then no new readers are allowed in. */ + + struct rw_lock_struct { ++ volatile lint lock_word; ++ /* Holds the state of the lock. 
*/ ++ volatile ulint waiters;/* 1: there are waiters */ ++ volatile ibool recursive;/* Default value FALSE which means the lock ++ is non-recursive. The value is typically set ++ to TRUE making normal rw_locks recursive. In ++ case of asynchronous IO, when a non-zero ++ value of 'pass' is passed then we keep the ++ lock non-recursive. ++ This flag also tells us about the state of ++ writer_thread field. If this flag is set ++ then writer_thread MUST contain the thread ++ id of the current x-holder or wait-x thread. ++ This flag must be reset in x_unlock ++ functions before incrementing the lock_word */ ++ volatile os_thread_id_t writer_thread; ++ /* Thread id of writer thread. Is only ++ guaranteed to have sane and non-stale ++ value iff recursive flag is set. */ + os_event_t event; /* Used by sync0arr.c for thread queueing */ +- +-#ifdef __WIN__ +- os_event_t wait_ex_event; /* This windows specific event is +- used by the thread which has set the +- lock state to RW_LOCK_WAIT_EX. The +- rw_lock design guarantees that this +- thread will be the next one to proceed +- once the current the event gets +- signalled. See LEMMA 2 in sync0sync.c */ +-#endif +- +- ulint reader_count; /* Number of readers who have locked this +- lock in the shared mode */ +- ulint writer; /* This field is set to RW_LOCK_EX if there +- is a writer owning the lock (in exclusive +- mode), RW_LOCK_WAIT_EX if a writer is +- queueing for the lock, and +- RW_LOCK_NOT_LOCKED, otherwise. */ +- os_thread_id_t writer_thread; +- /* Thread id of a possible writer thread */ +- ulint writer_count; /* Number of times the same thread has +- recursively locked the lock in the exclusive +- mode */ ++ os_event_t wait_ex_event; ++ /* Event for next-writer to wait on. A thread ++ must decrement lock_word before waiting. */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t mutex; /* The mutex protecting rw_lock_struct */ +- ulint pass; /* Default value 0. 
This is set to some +- value != 0 given by the caller of an x-lock +- operation, if the x-lock is to be passed to +- another thread to unlock (which happens in +- asynchronous i/o). */ +- ulint waiters; /* This ulint is set to 1 if there are +- waiters (readers or writers) in the global +- wait array, waiting for this rw_lock. +- Otherwise, == 0. */ +- ibool writer_is_wait_ex; +- /* This is TRUE if the writer field is +- RW_LOCK_WAIT_EX; this field is located far +- from the memory update hotspot fields which +- are at the start of this struct, thus we can +- peek this field without causing much memory +- bus traffic */ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ + UT_LIST_NODE_T(rw_lock_t) list; + /* All allocated rw locks are put into a + list */ +@@ -465,15 +517,23 @@ + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + /* In the debug version: pointer to the debug + info list of the lock */ ++ ulint level; /* Level in the global latching order. */ + #endif /* UNIV_SYNC_DEBUG */ +- ulint level; /* Level in the global latching +- order; default SYNC_LEVEL_NONE */ ++ ulint count_os_wait; /* Count of os_waits. 
May not be accurate */ + const char* cfile_name;/* File name where lock created */ +- ulint cline; /* Line where created */ ++ /* last s-lock file/line is not guaranteed to be correct */ + const char* last_s_file_name;/* File name where last s-locked */ + const char* last_x_file_name;/* File name where last x-locked */ +- ulint last_s_line; /* Line number where last time s-locked */ +- ulint last_x_line; /* Line number where last time x-locked */ ++ ibool writer_is_wait_ex; ++ /* This is TRUE if the writer field is ++ RW_LOCK_WAIT_EX; this field is located far ++ from the memory update hotspot fields which ++ are at the start of this struct, thus we can ++ peek this field without causing much memory ++ bus traffic */ ++ unsigned cline:14; /* Line where created */ ++ unsigned last_s_line:14; /* Line number where last time s-locked */ ++ unsigned last_x_line:14; /* Line number where last time x-locked */ + ulint magic_n; + }; + +diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic +--- a/innobase/include/sync0rw.ic 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0rw.ic 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. 
++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for threads) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -49,53 +72,88 @@ + ulint + rw_lock_get_waiters( + /*================*/ +- rw_lock_t* lock) ++ /* out: 1 if waiters, 0 otherwise */ ++ rw_lock_t* lock) /* in: rw-lock */ + { + return(lock->waiters); + } ++ ++/************************************************************************ ++Sets lock->waiters to 1. It is not an error if lock->waiters is already ++1. On platforms where ATOMIC builtins are used this function enforces a ++memory barrier. */ + UNIV_INLINE + void +-rw_lock_set_waiters( +-/*================*/ +- rw_lock_t* lock, +- ulint flag) ++rw_lock_set_waiter_flag( ++/*====================*/ ++ rw_lock_t* lock) /* in: rw-lock */ + { +- lock->waiters = flag; ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_compare_and_swap(&lock->waiters, 0, 1); ++#else /* HAVE_ATOMIC_BUILTINS */ ++ lock->waiters = 1; ++#endif /* HAVE_ATOMIC_BUILTINS */ + } ++ ++/************************************************************************ ++Resets lock->waiters to 0. It is not an error if lock->waiters is already ++0. On platforms where ATOMIC builtins are used this function enforces a ++memory barrier. 
*/ + UNIV_INLINE +-ulint +-rw_lock_get_writer( +-/*===============*/ +- rw_lock_t* lock) ++void ++rw_lock_reset_waiter_flag( ++/*======================*/ ++ rw_lock_t* lock) /* in: rw-lock */ + { +- return(lock->writer); ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_compare_and_swap(&lock->waiters, 1, 0); ++#else /* HAVE_ATOMIC_BUILTINS */ ++ lock->waiters = 0; ++#endif /* HAVE_ATOMIC_BUILTINS */ + } ++ ++/********************************************************************** ++Returns the write-status of the lock - this function made more sense ++with the old rw_lock implementation. */ + UNIV_INLINE +-void +-rw_lock_set_writer( ++ulint ++rw_lock_get_writer( + /*===============*/ +- rw_lock_t* lock, +- ulint flag) ++ rw_lock_t* lock) + { +- lock->writer = flag; ++ lint lock_word = lock->lock_word; ++ if(lock_word > 0) { ++ /* return NOT_LOCKED in s-lock state, like the writer ++ member of the old lock implementation. */ ++ return(RW_LOCK_NOT_LOCKED); ++ } else if (((-lock_word) % X_LOCK_DECR) == 0) { ++ return(RW_LOCK_EX); ++ } else { ++ ut_ad(lock_word > -X_LOCK_DECR); ++ return(RW_LOCK_WAIT_EX); ++ } + } ++ ++/********************************************************************** ++Returns number of readers. 
*/ + UNIV_INLINE + ulint + rw_lock_get_reader_count( + /*=====================*/ + rw_lock_t* lock) + { +- return(lock->reader_count); +-} +-UNIV_INLINE +-void +-rw_lock_set_reader_count( +-/*=====================*/ +- rw_lock_t* lock, +- ulint count) +-{ +- lock->reader_count = count; ++ lint lock_word = lock->lock_word; ++ if(lock_word > 0) { ++ /* s-locked, no x-waiters */ ++ return(X_LOCK_DECR - lock_word); ++ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) { ++ /* s-locked, with x-waiters */ ++ return((ulint)(-lock_word)); ++ } ++ return(0); + } ++ ++#ifndef HAVE_ATOMIC_BUILTINS + UNIV_INLINE + mutex_t* + rw_lock_get_mutex( +@@ -104,6 +162,7 @@ + { + return(&(lock->mutex)); + } ++#endif + + /********************************************************************** + Returns the value of writer_count for the lock. Does not reserve the lock +@@ -115,7 +174,126 @@ + /* out: value of writer_count */ + rw_lock_t* lock) /* in: rw-lock */ + { +- return(lock->writer_count); ++ lint lock_copy = lock->lock_word; ++ /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */ ++ if(lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) { ++ return(0); ++ } ++ return(((-lock_copy) / X_LOCK_DECR) + 1); ++} ++ ++/********************************************************************** ++Two different implementations for decrementing the lock_word of a rw_lock: ++one for systems supporting atomic operations, one for others. This does ++not support recursive x-locks: they should be handled by the caller and ++need not be atomic since they are performed by the current lock holder. ++Returns true if the decrement was made, false if not. 
*/ ++UNIV_INLINE ++ibool ++rw_lock_lock_word_decr( ++/*===================*/ ++ /* out: TRUE if decr occurs */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount) /* in: amount of decrement */ ++{ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ++ lint local_lock_word = lock->lock_word; ++ while (local_lock_word > 0) { ++ if(os_compare_and_swap(&(lock->lock_word), ++ local_lock_word, ++ local_lock_word - amount)) { ++ return(TRUE); ++ } ++ local_lock_word = lock->lock_word; ++ } ++ return(FALSE); ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ ibool success = FALSE; ++ mutex_enter(&(lock->mutex)); ++ if(lock->lock_word > 0) { ++ lock->lock_word -= amount; ++ success = TRUE; ++ } ++ mutex_exit(&(lock->mutex)); ++ return(success); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++} ++ ++/********************************************************************** ++Two different implementations for incrementing the lock_word of a rw_lock: ++one for systems supporting atomic operations, one for others. ++Returns the value of lock_word after increment. */ ++UNIV_INLINE ++lint ++rw_lock_lock_word_incr( ++/*===================*/ ++ /* out: lock->lock_word after increment */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount) /* in: amount of increment */ ++{ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ++ return(os_atomic_increment(&(lock->lock_word), amount)); ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ lint local_lock_word; ++ ++ mutex_enter(&(lock->mutex)); ++ ++ lock->lock_word += amount; ++ local_lock_word = lock->lock_word; ++ ++ mutex_exit(&(lock->mutex)); ++ ++ return(local_lock_word); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++} ++ ++/********************************************************************** ++This function sets the lock->writer_thread and lock->recursive fields. ++For platforms where we are using atomic builtins instead of lock->mutex ++it sets the lock->writer_thread field using atomics to ensure memory ++ordering. 
Note that it is assumed that the caller of this function ++effectively owns the lock i.e.: nobody else is allowed to modify ++lock->writer_thread at this point in time. ++The protocol is that lock->writer_thread MUST be updated BEFORE the ++lock->recursive flag is set. */ ++UNIV_INLINE ++void ++rw_lock_set_writer_id_and_recursion_flag( ++/*=====================================*/ ++ rw_lock_t* lock, /* in/out: lock to work on */ ++ ibool recursive) /* in: TRUE if recursion ++ allowed */ ++{ ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_thread_id_t local_thread; ++ ibool success; ++ ++ local_thread = lock->writer_thread; ++ success = os_compare_and_swap(&lock->writer_thread, ++ local_thread, curr_thread); ++ ut_a(success); ++ lock->recursive = recursive; ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ mutex_enter(&lock->mutex); ++ lock->writer_thread = curr_thread; ++ lock->recursive = recursive; ++ mutex_exit(&lock->mutex); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ + } + + /********************************************************************** +@@ -133,26 +311,21 @@ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(rw_lock_get_mutex(lock))); +-#endif /* UNIV_SYNC_DEBUG */ +- /* Check if the writer field is free */ +- +- if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) { +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. 
*/ ++ if (!rw_lock_lock_word_decr(lock, 1)) { ++ /* Locking did not succeed */ ++ return(FALSE); ++ } + + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, +- line); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line); + #endif +- lock->last_s_file_name = file_name; +- lock->last_s_line = line; +- +- return(TRUE); /* locking succeeded */ +- } ++ /* These debugging values are not set safely: they may be incorrect ++ or even refer to a line that is invalid for the file name. */ ++ lock->last_s_file_name = file_name; ++ lock->last_s_line = line; + +- return(FALSE); /* locking did not succeed */ ++ return(TRUE); /* locking succeeded */ + } + + /********************************************************************** +@@ -167,11 +340,10 @@ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ + { +- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED); +- ut_ad(rw_lock_get_reader_count(lock) == 0); ++ ut_ad(lock->lock_word == X_LOCK_DECR); + +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ /* Indicate there is a new reader by decrementing lock_word */ ++ lock->lock_word--; + + lock->last_s_file_name = file_name; + lock->last_s_line = line; +@@ -194,13 +366,11 @@ + ulint line) /* in: line where lock requested */ + { + ut_ad(rw_lock_validate(lock)); +- ut_ad(rw_lock_get_reader_count(lock) == 0); +- ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); ++ ut_ad(lock->lock_word == X_LOCK_DECR); + +- rw_lock_set_writer(lock, RW_LOCK_EX); ++ lock->lock_word -= X_LOCK_DECR; + lock->writer_thread = os_thread_get_curr_id(); +- lock->writer_count++; +- lock->pass = 0; ++ lock->recursive = TRUE; + + lock->last_x_file_name = file_name; + lock->last_x_line = line; +@@ -241,15 +411,12 @@ + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + #endif /* UNIV_SYNC_DEBUG */ + +- mutex_enter(rw_lock_get_mutex(lock)); +- +- if 
(UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) { +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */ ++ if (rw_lock_s_lock_low(lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ +- mutex_exit(rw_lock_get_mutex(lock)); + + rw_lock_s_lock_spin(lock, pass, file_name, line); + +@@ -259,86 +426,60 @@ + + /********************************************************************** + NOTE! Use the corresponding macro, not directly this function! Lock an +-rw-lock in shared mode for the current thread if the lock can be acquired +-immediately. */ ++rw-lock in exclusive mode for the current thread if the lock can be ++obtained immediately. */ + UNIV_INLINE + ibool +-rw_lock_s_lock_func_nowait( ++rw_lock_x_lock_func_nowait( + /*=======================*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +- ibool success = FALSE; +- +- mutex_enter(rw_lock_get_mutex(lock)); +- +- if (lock->writer == RW_LOCK_NOT_LOCKED) { +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); + +-#ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, +- line); +-#endif ++ ibool success; + +- lock->last_s_file_name = file_name; +- lock->last_s_line = line; ++#ifdef HAVE_ATOMIC_BUILTINS ++ success = os_compare_and_swap(&(lock->lock_word), X_LOCK_DECR, 0); ++#else + ++ success = FALSE; ++ mutex_enter(&(lock->mutex)); ++ if (lock->lock_word == X_LOCK_DECR) { ++ lock->lock_word = 0; + success = TRUE; + } ++ mutex_exit(&(lock->mutex)); + +- mutex_exit(rw_lock_get_mutex(lock)); +- +- return(success); +-} ++#endif ++ if (success) { ++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + 
+-/********************************************************************** +-NOTE! Use the corresponding macro, not directly this function! Lock an +-rw-lock in exclusive mode for the current thread if the lock can be +-obtained immediately. */ +-UNIV_INLINE +-ibool +-rw_lock_x_lock_func_nowait( +-/*=======================*/ +- /* out: TRUE if success */ +- rw_lock_t* lock, /* in: pointer to rw-lock */ +- const char* file_name,/* in: file name where lock requested */ +- ulint line) /* in: line where requested */ +-{ +- ibool success = FALSE; +- os_thread_id_t curr_thread = os_thread_get_curr_id(); +- mutex_enter(rw_lock_get_mutex(lock)); ++ } else if (lock->recursive ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ /* Relock: this lock_word modification is safe since no other ++ threads can modify (lock, unlock, or reserve) lock_word while ++ there is an exclusive writer and this is the writer thread. */ ++ lock->lock_word -= X_LOCK_DECR; + +- if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) { +- } else if (UNIV_LIKELY(rw_lock_get_writer(lock) +- == RW_LOCK_NOT_LOCKED)) { +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_thread = curr_thread; +- lock->pass = 0; +- relock: +- lock->writer_count++; ++ ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0); + ++ } else { ++ /* Failure */ ++ return(FALSE); ++ } + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); ++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); + #endif + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- success = TRUE; +- } else if (rw_lock_get_writer(lock) == RW_LOCK_EX +- && lock->pass == 0 +- && os_thread_eq(lock->writer_thread, curr_thread)) { +- goto relock; +- } +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; + + ut_ad(rw_lock_validate(lock)); + +- return(success); ++ return(TRUE); + } + + 
/********************************************************************** +@@ -354,39 +495,21 @@ + #endif + ) + { +- mutex_t* mutex = &(lock->mutex); +- ibool sg = FALSE; +- +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter(mutex); +- +- /* Reset the shared lock by decrementing the reader count */ +- +- ut_a(lock->reader_count > 0); +- lock->reader_count--; ++ ut_ad((lock->lock_word % X_LOCK_DECR) != 0); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); + #endif + +- /* If there may be waiters and this was the last s-lock, +- signal the object */ ++ /* Increment lock_word to indicate 1 less reader */ ++ if (rw_lock_lock_word_incr(lock, 1) == 0) { + +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->reader_count == 0) { +- sg = TRUE; +- +- rw_lock_set_waiters(lock, 0); +- } +- +- mutex_exit(mutex); +- +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ ++ /* wait_ex waiter exists. It may not be asleep, but we signal ++ anyway. We do not wake other waiters, because they can't ++ exist without wait_ex waiter and wait_ex waiter goes first.*/ + os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); + sync_array_object_signalled(sync_primary_wait_array); ++ + } + + ut_ad(rw_lock_validate(lock)); +@@ -405,16 +528,15 @@ + /*====================*/ + rw_lock_t* lock) /* in: rw-lock */ + { +- /* Reset the shared lock by decrementing the reader count */ +- +- ut_ad(lock->reader_count > 0); +- +- lock->reader_count--; ++ ut_ad(lock->lock_word < X_LOCK_DECR); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED); + #endif + ++ /* Decrease reader count by incrementing lock_word */ ++ lock->lock_word++; ++ + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); + #ifdef UNIV_SYNC_PERF_STAT +@@ -435,42 +557,32 @@ + #endif + ) + { +- ibool sg = FALSE; +- +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter(&(lock->mutex)); +- +- /* Reset the exclusive lock if this thread no 
longer has an x-mode +- lock */ +- +- ut_ad(lock->writer_count > 0); ++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + +- lock->writer_count--; +- +- if (lock->writer_count == 0) { +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); ++ /* lock->recursive flag also indicates if lock->writer_thread is ++ valid or stale. If we are the last of the recursive callers ++ then we must unset lock->recursive flag to indicate that the ++ lock->writer_thread is now stale. ++ Note that since we still hold the x-lock we can safely read the ++ lock_word. */ ++ if (lock->lock_word == 0) { ++ /* Last caller in a possible recursive chain. */ ++ lock->recursive = FALSE; + } + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); + #endif + +- /* If there may be waiters, signal the lock */ +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->writer_count == 0) { +- +- sg = TRUE; +- rw_lock_set_waiters(lock, 0); +- } +- +- mutex_exit(&(lock->mutex)); +- +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ +- os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); +- sync_array_object_signalled(sync_primary_wait_array); ++ if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) { ++ /* Lock is now free. May have to signal read/write waiters. ++ We do not need to signal wait_ex waiters, since they cannot ++ exist when there is a writer. 
*/ ++ if (lock->waiters) { ++ rw_lock_reset_waiter_flag(lock); ++ os_event_set(lock->event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } + } + + ut_ad(rw_lock_validate(lock)); +@@ -492,18 +604,18 @@ + /* Reset the exclusive lock if this thread no longer has an x-mode + lock */ + +- ut_ad(lock->writer_count > 0); +- +- lock->writer_count--; +- +- if (lock->writer_count == 0) { +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); +- } ++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); + #endif + ++ if (lock->lock_word == 0) { ++ lock->recursive = FALSE; ++ } ++ ++ lock->lock_word += X_LOCK_DECR; ++ + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); + +diff -ruN a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h +--- a/innobase/include/sync0sync.h 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/include/sync0sync.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -465,8 +488,11 @@ + struct mutex_struct { + os_event_t event; /* Used by sync0arr.c for the wait queue */ + ulint lock_word; /* This ulint is the target of the atomic +- test-and-set instruction in Win32 */ +-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER) ++ test-and-set instruction in Win32 and ++ x86 32/64 with GCC 4.1.0 or later version */ ++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#elif defined(HAVE_ATOMIC_BUILTINS) ++#else + os_fast_mutex_t + os_fast_mutex; /* In other systems we use this OS mutex + in place of lock_word */ +@@ -525,8 +551,7 @@ + /* The number of system calls made in this module. Intended for performance + monitoring. */ + +-extern ulint mutex_system_call_count; +-extern ulint mutex_exit_count; ++extern ib_longlong mutex_exit_count; + + /* Latching order checks start when this is set TRUE */ + extern ibool sync_order_checks_on; +diff -ruN a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic +--- a/innobase/include/sync0sync.ic 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0sync.ic 2009-10-22 15:18:44.000000000 +0900 +@@ -1,21 +1,34 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. 
Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +-#if defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +-/* %z0: Use the size of operand %0 which in our case is *m to determine +-instruction size, it should end up as xchgl. "1" in the input constraint, +-says that "in" has to go in the same place as "out".*/ +-#define TAS(m, in, out) \ +- asm volatile ("xchg%z0 %2, %0" \ +- : "=g" (*(m)), "=r" (out) \ +- : "1" (in)) /* Note: "1" here refers to "=r" (out) */ +-#endif +- + /********************************************************************** + Sets the waiters field in a mutex. 
*/ + +@@ -94,12 +107,8 @@ + /* mutex_fence(); */ + + return(res); +-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +- ulint res; +- +- TAS(&mutex->lock_word, 1, res); +- +- return(res); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ return __sync_lock_test_and_set(&(mutex->lock_word), 1); + #else + ibool ret; + +@@ -136,10 +145,11 @@ + __asm MOV EDX, 0 + __asm MOV ECX, lw + __asm XCHG EDX, DWORD PTR [ECX] +-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +- ulint res; +- +- TAS(&mutex->lock_word, 0, res); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ /* In theory __sync_lock_release should be used to release the lock. ++ Unfortunately, it does not work properly alone. The workaround is ++ that more conservative __sync_lock_test_and_set is used instead. */ ++ __sync_lock_test_and_set(&(mutex->lock_word), 0); + #else + mutex->lock_word = 0; + +diff -ruN a/innobase/row/row0sel.c b/innobase/row/row0sel.c +--- a/innobase/row/row0sel.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/row/row0sel.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1178,7 +1178,7 @@ + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; +- } else if (btr_search_latch.writer_is_wait_ex) { ++ } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often +@@ -3123,7 +3123,7 @@ + /* PHASE 0: Release a possible s-latch we are holding on the + adaptive hash index latch if there is someone waiting behind */ + +- if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED) ++ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED) + && trx->has_search_latch) { + + /* There is an x-latch request on the adaptive hash index: +diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c +--- a/innobase/sync/sync0arr.c 2009-09-10 04:03:01.000000000 +0900 ++++ b/innobase/sync/sync0arr.c 2009-10-22 
15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The wait array used in synchronization primitives + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -297,25 +320,21 @@ + } + + /*********************************************************************** +-Puts the cell event in reset state. */ ++Returns the event that the thread owning the cell waits for. */ + static +-ib_longlong +-sync_cell_event_reset( +-/*==================*/ +- /* out: value of signal_count +- at the time of reset. 
*/ +- ulint type, /* in: lock type mutex/rw_lock */ +- void* object) /* in: the rw_lock/mutex object */ ++os_event_t ++sync_cell_get_event( ++/*================*/ ++ sync_cell_t* cell) /* in: non-empty sync array cell */ + { ++ ulint type = cell->request_type; ++ + if (type == SYNC_MUTEX) { +- return(os_event_reset(((mutex_t *) object)->event)); +-#ifdef __WIN__ ++ return(((mutex_t *) cell->wait_object)->event); + } else if (type == RW_LOCK_WAIT_EX) { +- return(os_event_reset( +- ((rw_lock_t *) object)->wait_ex_event)); +-#endif +- } else { +- return(os_event_reset(((rw_lock_t *) object)->event)); ++ return(((rw_lock_t *) cell->wait_object)->wait_ex_event); ++ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ ++ return(((rw_lock_t *) cell->wait_object)->event); + } + } + +@@ -334,6 +353,7 @@ + ulint* index) /* out: index of the reserved cell */ + { + sync_cell_t* cell; ++ os_event_t event; + ulint i; + + ut_a(object); +@@ -372,8 +392,8 @@ + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ +- cell->signal_count = sync_cell_event_reset(type, +- object); ++ event = sync_cell_get_event(cell); ++ cell->signal_count = os_event_reset(event); + + cell->reservation_time = time(NULL); + +@@ -413,19 +433,7 @@ + ut_a(!cell->waiting); + ut_ad(os_thread_get_curr_id() == cell->thread); + +- if (cell->request_type == SYNC_MUTEX) { +- event = ((mutex_t*) cell->wait_object)->event; +-#ifdef __WIN__ +- /* On windows if the thread about to wait is the one which +- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then +- it waits on a special event i.e.: wait_ex_event. 
*/ +- } else if (cell->request_type == RW_LOCK_WAIT_EX) { +- event = ((rw_lock_t*) cell->wait_object)->wait_ex_event; +-#endif +- } else { +- event = ((rw_lock_t*) cell->wait_object)->event; +- } +- ++ event = sync_cell_get_event(cell); + cell->waiting = TRUE; + + #ifdef UNIV_SYNC_DEBUG +@@ -464,6 +472,7 @@ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; ++ ulint writer; + + type = cell->request_type; + +@@ -492,9 +501,7 @@ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX +-#ifdef __WIN__ + || type == RW_LOCK_WAIT_EX +-#endif + || type == RW_LOCK_SHARED) { + + fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); +@@ -505,21 +512,24 @@ + " RW-latch at %p created in file %s line %lu\n", + rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); +- if (rwlock->writer != RW_LOCK_NOT_LOCKED) { ++ writer = rw_lock_get_writer(rwlock); ++ if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), +- rwlock->writer == RW_LOCK_EX ++ writer == RW_LOCK_EX + ? " exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, +- "number of readers %lu, waiters flag %lu\n" ++ "number of readers %lu, waiters flag %lu, " ++ "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", +- (ulong) rwlock->reader_count, ++ (ulong) rw_lock_get_reader_count(rwlock), + (ulong) rwlock->waiters, ++ rwlock->lock_word, + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, +@@ -773,28 +783,30 @@ + return(TRUE); + } + +- } else if (cell->request_type == RW_LOCK_EX +- || cell->request_type == RW_LOCK_WAIT_EX) { ++ } else if (cell->request_type == RW_LOCK_EX) { + + lock = cell->wait_object; + +- if (rw_lock_get_reader_count(lock) == 0 +- && rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ if (lock->lock_word > 0) { ++ /* Either unlocked or only read locked. 
*/ + + return(TRUE); + } + +- if (rw_lock_get_reader_count(lock) == 0 +- && rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX +- && os_thread_eq(lock->writer_thread, cell->thread)) { ++ } else if (cell->request_type == RW_LOCK_WAIT_EX) { ++ ++ lock = cell->wait_object; ++ ++ /* lock_word == 0 means all readers have left */ ++ if (lock->lock_word == 0) { + + return(TRUE); + } +- + } else if (cell->request_type == RW_LOCK_SHARED) { + lock = cell->wait_object; + +- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ /* lock_word > 0 means no writer or reserved writer */ ++ if (lock->lock_word > 0) { + + return(TRUE); + } +@@ -839,11 +851,15 @@ + /*========================*/ + sync_array_t* arr) /* in: wait array */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ (void) os_atomic_increment(&arr->sg_count, 1); ++#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); ++#endif + } + + /************************************************************************** +@@ -859,6 +875,7 @@ + sync_cell_t* cell; + ulint count; + ulint i; ++ os_event_t event; + + sync_array_enter(arr); + +@@ -868,36 +885,20 @@ + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); ++ i++; + +- if (cell->wait_object != NULL) { +- ++ if (cell->wait_object == NULL) { ++ continue; ++ } + count++; + + if (sync_arr_cell_can_wake_up(cell)) { + +- if (cell->request_type == SYNC_MUTEX) { +- mutex_t* mutex; ++ event = sync_cell_get_event(cell); + +- mutex = cell->wait_object; +- os_event_set(mutex->event); +-#ifdef __WIN__ +- } else if (cell->request_type +- == RW_LOCK_WAIT_EX) { +- rw_lock_t* lock; +- +- lock = cell->wait_object; +- os_event_set(lock->wait_ex_event); +-#endif +- } else { +- rw_lock_t* lock; +- +- lock = cell->wait_object; +- os_event_set(lock->event); +- } +- } ++ os_event_set(event); + } + +- i++; + } + + sync_array_exit(arr); +@@ -1014,4 +1015,3 @@ + + sync_array_exit(arr); + } +- +diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c +--- 
a/innobase/sync/sync0rw.c 2009-09-10 04:03:01.000000000 +0900 ++++ b/innobase/sync/sync0rw.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for thread synchronization) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -15,17 +38,110 @@ + #include "mem0mem.h" + #include "srv0srv.h" + +-ulint rw_s_system_call_count = 0; +-ulint rw_s_spin_wait_count = 0; +-ulint rw_s_os_wait_count = 0; ++/* ++ IMPLEMENTATION OF THE RW_LOCK ++ ============================= ++The status of a rw_lock is held in lock_word. The initial value of lock_word is ++X_LOCK_DECR. 
lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR ++for each x-lock. This describes the lock state for each value of lock_word: ++ ++lock_word == X_LOCK_DECR: Unlocked. ++0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers. ++ (X_LOCK_DECR - lock_word) is the ++ number of readers that hold the lock. ++lock_word == 0: Write locked ++-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer. ++ (-lock_word) is the number of readers ++ that hold the lock. ++lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been ++ decremented by X_LOCK_DECR once for each lock, ++ so the number of locks is: ++ ((-lock_word) / X_LOCK_DECR) + 1 ++When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0: ++other values of lock_word are invalid. ++ ++The lock_word is always read and updated atomically and consistently, so that ++it always represents the state of the lock, and the state of the lock changes ++with a single atomic operation. This lock_word holds all of the information ++that a thread needs in order to determine if it is eligible to gain the lock ++or if it must spin or sleep. The one exception to this is that writer_thread ++must be verified before recursive write locks: to solve this scenario, we make ++writer_thread readable by all threads, but only writeable by the x-lock holder. ++ ++The other members of the lock obey the following rules to remain consistent: ++ ++recursive: This and the writer_thread field together control the ++ behaviour of recursive x-locking. ++ lock->recursive must be FALSE in following states: ++ 1) The writer_thread contains garbage i.e.: the ++ lock has just been initialized. ++ 2) The lock is not x-held and there is no ++ x-waiter waiting on WAIT_EX event. ++ 3) The lock is x-held or there is an x-waiter ++ waiting on WAIT_EX event but the 'pass' value ++ is non-zero. 
++ lock->recursive is TRUE iff: ++ 1) The lock is x-held or there is an x-waiter ++ waiting on WAIT_EX event and the 'pass' value ++ is zero. ++ This flag must be set after the writer_thread field ++ has been updated with a memory ordering barrier. ++ It is unset before the lock_word has been incremented. ++writer_thread: Is used only in recursive x-locking. Can only be safely ++ read iff lock->recursive flag is TRUE. ++ This field is uninitialized at lock creation time and ++ is updated atomically when x-lock is acquired or when ++ move_ownership is called. A thread is only allowed to ++ set the value of this field to its thread_id i.e.: a ++ thread cannot set writer_thread to some other thread's ++ id. ++waiters: May be set to 1 anytime, but to avoid unnecessary wake-up ++ signals, it should only be set to 1 when there are threads ++ waiting on event. Must be 1 when a writer starts waiting to ++ ensure the current x-locking thread sends a wake-up signal ++ during unlock. May only be reset to 0 immediately before ++ a wake-up signal is sent to event. On most platforms, a ++ memory barrier is required after waiters is set, and before ++ verifying lock_word is still held, to ensure some unlocker ++ really does see the flag's new value. ++event: Threads wait on event for read or write lock when another ++ thread has an x-lock or an x-lock reservation (wait_ex). A ++ thread may only wait on event after performing the following ++ actions in order: ++ (1) Record the counter value of event (with os_event_reset). ++ (2) Set waiters to 1. ++ (3) Verify lock_word <= 0. ++ (1) must come before (2) to ensure signal is not missed. ++ (2) must come before (3) to ensure a signal is sent. ++ These restrictions force the above ordering. ++ Immediately before sending the wake-up signal, we should: ++ (1) Verify lock_word == X_LOCK_DECR (unlocked) ++ (2) Reset waiters to 0.
++wait_ex_event: A thread may only wait on the wait_ex_event after it has ++ performed the following actions in order: ++ (1) Decrement lock_word by X_LOCK_DECR. ++ (2) Record counter value of wait_ex_event (os_event_reset, ++ called from sync_array_reserve_cell). ++ (3) Verify that lock_word < 0. ++ (1) must come first to ensure no other threads become reader ++ or next writer, and to notify unlocker that signal must be sent. ++ (2) must come before (3) to ensure the signal is not missed. ++ These restrictions force the above ordering. ++ Immediately before sending the wake-up signal, we should: ++ Verify lock_word == 0 (waiting thread holds x_lock) ++*/ ++ ++ib_longlong rw_s_spin_wait_count = 0; ++ib_longlong rw_s_spin_round_count = 0; ++ib_longlong rw_s_os_wait_count = 0; ++ ++ib_longlong rw_s_exit_count = 0; ++ ++ib_longlong rw_x_spin_wait_count = 0; ++ib_longlong rw_x_spin_round_count = 0; ++ib_longlong rw_x_os_wait_count = 0; + +-ulint rw_s_exit_count = 0; +- +-ulint rw_x_system_call_count = 0; +-ulint rw_x_spin_wait_count = 0; +-ulint rw_x_os_wait_count = 0; +- +-ulint rw_x_exit_count = 0; ++ib_longlong rw_x_exit_count = 0; + + /* The global list of rw-locks */ + rw_lock_list_t rw_lock_list; +@@ -99,22 +215,30 @@ + object is created, then the following call initializes + the sync system.
*/ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_create(rw_lock_get_mutex(lock)); + mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + + lock->mutex.cfile_name = cfile_name; + lock->mutex.cline = cline; +-#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP ++# if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; +-#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ ++# endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +- rw_lock_set_waiters(lock, 0); +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); +- lock->writer_count = 0; +- rw_lock_set_reader_count(lock, 0); +- +- lock->writer_is_wait_ex = FALSE; ++#else /* HAVE_ATOMIC_BUILTINS */ ++# ifdef UNIV_DEBUG ++ UT_NOT_USED(cmutex_name); ++# endif ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ ++ lock->lock_word = X_LOCK_DECR; ++ lock->waiters = 0; ++ ++ /* We set this value to signify that lock->writer_thread ++ contains garbage at initialization and cannot be used for ++ recursive x-locking. 
*/ ++ lock->recursive = FALSE; + + #ifdef UNIV_SYNC_DEBUG + UT_LIST_INIT(lock->debug_list); +@@ -126,15 +250,13 @@ + lock->cfile_name = cfile_name; + lock->cline = cline; + ++ lock->count_os_wait = 0; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; + lock->event = os_event_create(NULL); +- +-#ifdef __WIN__ + lock->wait_ex_event = os_event_create(NULL); +-#endif + + mutex_enter(&rw_lock_list_mutex); + +@@ -158,23 +280,17 @@ + /*=========*/ + rw_lock_t* lock) /* in: rw-lock */ + { +-#ifdef UNIV_DEBUG + ut_a(rw_lock_validate(lock)); +-#endif /* UNIV_DEBUG */ +- ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); +- ut_a(rw_lock_get_waiters(lock) == 0); +- ut_a(rw_lock_get_reader_count(lock) == 0); ++ ut_a(lock->lock_word == X_LOCK_DECR); + +- lock->magic_n = 0; +- ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_free(rw_lock_get_mutex(lock)); ++#endif /* HAVE_ATOMIC_BUILTINS */ + + mutex_enter(&rw_lock_list_mutex); + os_event_free(lock->event); + +-#ifdef __WIN__ + os_event_free(lock->wait_ex_event); +-#endif + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); +@@ -186,6 +302,8 @@ + UT_LIST_REMOVE(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); ++ ++ lock->magic_n = 0; + } + + /********************************************************************** +@@ -199,19 +317,12 @@ + { + ut_a(lock); + +- mutex_enter(rw_lock_get_mutex(lock)); ++ ulint waiters = rw_lock_get_waiters(lock); ++ lint lock_word = lock->lock_word; + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); +- ut_a((rw_lock_get_reader_count(lock) == 0) +- || (rw_lock_get_writer(lock) != RW_LOCK_EX)); +- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)); +- ut_a((rw_lock_get_waiters(lock) == 0) +- || (rw_lock_get_waiters(lock) == 1)); +- ut_a((lock->writer != 
RW_LOCK_EX) || (lock->writer_count > 0)); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ ut_a(waiters == 0 || waiters == 1); ++ ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0); + + return(TRUE); + } +@@ -232,18 +343,15 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint i; /* spin round count */ ++ ulint i = 0; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + ++ rw_s_spin_wait_count++; /* Count calls to this function */ + lock_loop: +- rw_s_spin_wait_count++; + + /* Spin waiting for the writer field to become free */ +- i = 0; +- +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } +@@ -262,28 +370,32 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + +- mutex_enter(rw_lock_get_mutex(lock)); +- + /* We try once again to obtain the lock */ +- + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { +- mutex_exit(rw_lock_get_mutex(lock)); ++ rw_s_spin_round_count += i; + + return; /* Success */ + } else { +- /* If we get here, locking did not succeed, we may +- suspend the thread to wait in the wait array */ + +- rw_s_system_call_count++; ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto lock_loop; ++ } ++ ++ rw_s_spin_round_count += i; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, RW_LOCK_SHARED, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* Set waiters before checking lock_word to ensure wake-up ++ signal is sent. This may lead to some unnecessary signals. 
*/ ++ rw_lock_set_waiter_flag(lock); ++ ++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Success */ ++ } + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -292,11 +404,13 @@ + lock, lock->cfile_name, (ulong) lock->cline); + } + +- rw_s_system_call_count++; ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; + rw_s_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + ++ i = 0; + goto lock_loop; + } + } +@@ -318,114 +432,130 @@ + { + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + +- mutex_enter(&(lock->mutex)); +- +- lock->writer_thread = os_thread_get_curr_id(); +- +- lock->pass = 0; +- +- mutex_exit(&(lock->mutex)); ++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + } + + /********************************************************************** +-Low-level function for acquiring an exclusive lock. */ ++Function for the next writer to call. Waits for readers to exit. 
++The caller must have already decremented lock_word by X_LOCK_DECR.*/ + UNIV_INLINE +-ulint +-rw_lock_x_lock_low( +-/*===============*/ +- /* out: RW_LOCK_NOT_LOCKED if did +- not succeed, RW_LOCK_EX if success, +- RW_LOCK_WAIT_EX, if got wait reservation */ ++void ++rw_lock_x_lock_wait( ++/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ ++#ifdef UNIV_SYNC_DEBUG + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ ++#endif + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(rw_lock_get_mutex(lock))); +-#endif /* UNIV_SYNC_DEBUG */ +- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ ulint index; ++ ulint i = 0; + +- if (rw_lock_get_reader_count(lock) == 0) { ++ ut_ad(lock->lock_word <= 0); ++ ++ while (lock->lock_word < 0) { ++ if (srv_spin_wait_delay) { ++ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); ++ } ++ if(i < SYNC_SPIN_ROUNDS) { ++ i++; ++ continue; ++ } + +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_thread = os_thread_get_curr_id(); +- lock->writer_count++; +- lock->pass = pass; ++ /* If there is still a reader, then go to sleep.*/ ++ rw_x_spin_round_count += i; ++ i = 0; ++ sync_array_reserve_cell(sync_primary_wait_array, ++ lock, ++ RW_LOCK_WAIT_EX, ++ file_name, line, ++ &index); ++ /* Check lock_word to ensure wake-up isn't missed.*/ ++ if(lock->lock_word < 0) { + ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; ++ rw_x_os_wait_count++; ++ ++ /* Add debug info as it is needed to detect possible ++ deadlock. We must add info for WAIT_EX thread for ++ deadlock detection to work properly. 
*/ + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); + #endif +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); +- } else { +- /* There are readers, we have to wait */ +- rw_lock_set_writer(lock, RW_LOCK_WAIT_EX); +- lock->writer_thread = os_thread_get_curr_id(); +- lock->pass = pass; +- lock->writer_is_wait_ex = TRUE; + ++ sync_array_wait_event(sync_primary_wait_array, ++ index); + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, +- file_name, line); ++ rw_lock_remove_debug_info(lock, pass, ++ RW_LOCK_WAIT_EX); + #endif +- +- return(RW_LOCK_WAIT_EX); ++ /* It is possible to wake when lock_word < 0. ++ We must pass the while-loop check to proceed.*/ ++ } else { ++ sync_array_free_cell(sync_primary_wait_array, ++ index); + } ++ } ++ rw_x_spin_round_count += i; ++} + +- } else if ((rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- && os_thread_eq(lock->writer_thread, +- os_thread_get_curr_id())) { ++/********************************************************************** ++Low-level function for acquiring an exclusive lock. */ ++UNIV_INLINE ++ibool ++rw_lock_x_lock_low( ++/*===============*/ ++ /* out: RW_LOCK_NOT_LOCKED if did ++ not succeed, RW_LOCK_EX if success. 
*/ ++ rw_lock_t* lock, /* in: pointer to rw-lock */ ++ ulint pass, /* in: pass value; != 0, if the lock will ++ be passed to another thread to unlock */ ++ const char* file_name,/* in: file name where lock requested */ ++ ulint line) /* in: line where requested */ ++{ ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); + +- if (rw_lock_get_reader_count(lock) == 0) { ++ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { + +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_count++; +- lock->pass = pass; +- lock->writer_is_wait_ex = FALSE; ++ /* lock->recursive also tells us if the writer_thread ++ field is stale or active. As we are going to write ++ our own thread id in that field it must be that the ++ current writer_thread value is not active. */ ++ ut_a(!lock->recursive); + ++ /* Decrement occurred: we are writer or next-writer. */ ++ rw_lock_set_writer_id_and_recursion_flag(lock, ++ pass ? FALSE : TRUE); ++ ++ rw_lock_x_lock_wait(lock, + #ifdef UNIV_SYNC_DEBUG +- rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX); +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, +- file_name, line); ++ pass, + #endif ++ file_name, line); + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); ++ } else { ++ /* Decrement failed: relock or failed lock */ ++ if (!pass && lock->recursive ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ /* Relock */ ++ lock->lock_word -= X_LOCK_DECR; ++ } else { ++ /* Another thread locked before us */ ++ return(FALSE); + } +- +- return(RW_LOCK_WAIT_EX); +- +- } else if ((rw_lock_get_writer(lock) == RW_LOCK_EX) +- && os_thread_eq(lock->writer_thread, +- os_thread_get_curr_id()) +- && (lock->pass == 0) +- && (pass == 0)) { +- +- lock->writer_count++; +- ++ } + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, +- line); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ file_name, line); + #endif ++ 
lock->last_x_file_name = file_name; ++ lock->last_x_line = (unsigned int) line; + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); +- } +- +- /* Locking did not succeed */ +- return(RW_LOCK_NOT_LOCKED); ++ return(TRUE); + } + + /********************************************************************** +@@ -448,47 +578,30 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint state; /* lock state acquired */ + ulint i; /* spin round count */ ++ ibool spinning = FALSE; + + ut_ad(rw_lock_validate(lock)); + +-lock_loop: +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter_fast(&(lock->mutex)); +- +- state = rw_lock_x_lock_low(lock, pass, file_name, line); ++ i = 0; + +- mutex_exit(&(lock->mutex)); ++lock_loop: + +- if (state == RW_LOCK_EX) { ++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) { ++ rw_x_spin_round_count += i; + + return; /* Locking succeeded */ + +- } else if (state == RW_LOCK_NOT_LOCKED) { +- +- /* Spin waiting for the writer field to become free */ +- i = 0; +- +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { +- if (srv_spin_wait_delay) { +- ut_delay(ut_rnd_interval(0, +- srv_spin_wait_delay)); +- } ++ } else { + +- i++; +- } +- if (i == SYNC_SPIN_ROUNDS) { +- os_thread_yield(); ++ if (!spinning) { ++ spinning = TRUE; ++ rw_x_spin_wait_count++; + } +- } else if (state == RW_LOCK_WAIT_EX) { + +- /* Spin waiting for the reader count field to become zero */ +- i = 0; +- +- while (rw_lock_get_reader_count(lock) != 0 +- && i < SYNC_SPIN_ROUNDS) { ++ /* Spin waiting for the lock_word to become free */ ++ while (i < SYNC_SPIN_ROUNDS ++ && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); +@@ -498,12 +611,13 @@ + } + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); ++ } else { ++ goto lock_loop; + } 
+- } else { +- i = 0; /* Eliminate a compiler warning */ +- ut_error; + } + ++ rw_x_spin_round_count += i; ++ + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-x-lock at %p cfile %s cline %lu rnds %lu\n", +@@ -511,39 +625,20 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + +- rw_x_spin_wait_count++; +- +- /* We try once again to obtain the lock. Acquire the mutex protecting +- the rw-lock fields */ +- +- mutex_enter(rw_lock_get_mutex(lock)); +- +- state = rw_lock_x_lock_low(lock, pass, file_name, line); +- +- if (state == RW_LOCK_EX) { +- mutex_exit(rw_lock_get_mutex(lock)); +- +- return; /* Locking succeeded */ +- } +- +- rw_x_system_call_count++; +- + sync_array_reserve_cell(sync_primary_wait_array, + lock, +-#ifdef __WIN__ +- /* On windows RW_LOCK_WAIT_EX signifies +- that this thread should wait on the +- special wait_ex_event. */ +- (state == RW_LOCK_WAIT_EX) +- ? RW_LOCK_WAIT_EX : +-#endif + RW_LOCK_EX, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* Waiters must be set before checking lock_word, to ensure signal ++ is sent. This could lead to a few unnecessary wake-up signals. 
*/ ++ rw_lock_set_waiter_flag(lock); ++ ++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Locking succeeded */ ++ } + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -552,11 +647,13 @@ + lock->cfile_name, (ulong) lock->cline); + } + +- rw_x_system_call_count++; ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; + rw_x_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + ++ i = 0; + goto lock_loop; + } + +@@ -697,7 +794,9 @@ + rw_lock_t* lock, /* in: rw-lock */ + ulint level) /* in: level */ + { ++#ifdef UNIV_SYNC_DEBUG + lock->level = level; ++#endif /* UNIV_SYNC_DEBUG */ + } + + #ifdef UNIV_SYNC_DEBUG +@@ -718,7 +817,7 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + +- mutex_enter(&(lock->mutex)); ++ rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + +@@ -728,7 +827,7 @@ + && (info->pass == 0) + && (info->lock_type == lock_type)) { + +- mutex_exit(&(lock->mutex)); ++ rw_lock_debug_mutex_exit(); + /* Found! 
*/ + + return(TRUE); +@@ -736,7 +835,7 @@ + + info = UT_LIST_GET_NEXT(list, info); + } +- mutex_exit(&(lock->mutex)); ++ rw_lock_debug_mutex_exit(); + + return(FALSE); + } +@@ -758,22 +857,18 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + +- mutex_enter(&(lock->mutex)); +- + if (lock_type == RW_LOCK_SHARED) { +- if (lock->reader_count > 0) { ++ if (rw_lock_get_reader_count(lock) > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { +- if (lock->writer == RW_LOCK_EX) { ++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + +- mutex_exit(&(lock->mutex)); +- + return(ret); + } + +@@ -801,11 +896,10 @@ + + count++; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); +- +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++#endif ++ if (lock->lock_word != X_LOCK_DECR) { + + fprintf(stderr, "RW-LOCK: %p ", lock); + +@@ -821,8 +915,10 @@ + info = UT_LIST_GET_NEXT(list, info); + } + } +- ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif ++ + lock = UT_LIST_GET_NEXT(list, lock); + } + +@@ -845,9 +941,10 @@ + "RW-LATCH INFO\n" + "RW-LATCH: %p ", lock); + +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++#ifndef HAVE_ATOMIC_BUILTINS ++ mutex_enter(&(lock->mutex)); ++#endif ++ if (lock->lock_word != X_LOCK_DECR) { + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", stderr); +@@ -861,6 +958,9 @@ + info = UT_LIST_GET_NEXT(list, info); + } + } ++#ifndef HAVE_ATOMIC_BUILTINS ++ mutex_exit(&(lock->mutex)); ++#endif + } + + /************************************************************************* +@@ -909,14 +1009,11 @@ + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { +- mutex_enter(rw_lock_get_mutex(lock)); + +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- 
|| (rw_lock_get_reader_count(lock) != 0)) { ++ if (lock->lock_word != X_LOCK_DECR) { + count++; + } + +- mutex_exit(rw_lock_get_mutex(lock)); + lock = UT_LIST_GET_NEXT(list, lock); + } + +diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c +--- a/innobase/sync/sync0sync.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/sync/sync0sync.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -140,17 +163,12 @@ + + ulint sync_dummy = 0; + +-/* The number of system calls made in this module. 
Intended for performance +-monitoring. */ +- +-ulint mutex_system_call_count = 0; +- + /* Number of spin waits on mutexes: for performance monitoring */ + +-ulint mutex_spin_round_count = 0; +-ulint mutex_spin_wait_count = 0; +-ulint mutex_os_wait_count = 0; +-ulint mutex_exit_count = 0; ++ib_longlong mutex_spin_round_count = 0; ++ib_longlong mutex_spin_wait_count = 0; ++ib_longlong mutex_os_wait_count = 0; ++ib_longlong mutex_exit_count = 0; + + /* The global array of wait cells for implementation of the database's own + mutexes and read-write locks */ +@@ -240,6 +258,8 @@ + { + #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + mutex_reset_lock_word(mutex); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ mutex_reset_lock_word(mutex); + #else + os_fast_mutex_init(&(mutex->os_fast_mutex)); + mutex->lock_word = 0; +@@ -325,7 +345,9 @@ + + os_event_free(mutex->event); + +-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#elif defined(HAVE_ATOMIC_BUILTINS) ++#else + os_fast_mutex_free(&(mutex->os_fast_mutex)); + #endif + /* If we free the mutex protecting the mutex list (freeing is +@@ -421,6 +443,12 @@ + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + ut_ad(mutex); + ++ /* This update is not thread safe, but we don't mind if the count ++ isn't exact. Moved out of ifdef that follows because we are willing ++ to sacrifice the cost of counting this as the data is valuable. ++ Count the number of calls to mutex_spin_wait. 
*/ ++ mutex_spin_wait_count++; ++ + mutex_loop: + + i = 0; +@@ -433,7 +461,6 @@ + + spin_loop: + #if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP +- mutex_spin_wait_count++; + mutex->count_spin_loop++; + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +@@ -502,8 +529,6 @@ + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); + +- mutex_system_call_count++; +- + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are +@@ -551,7 +576,6 @@ + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); + #endif + +- mutex_system_call_count++; + mutex_os_wait_count++; + + #ifndef UNIV_HOTBACKUP +@@ -1368,20 +1392,31 @@ + FILE* file) /* in: file where to print */ + { + #ifdef UNIV_SYNC_DEBUG +- fprintf(stderr, "Mutex exits %lu, rws exits %lu, rwx exits %lu\n", ++ fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n", + mutex_exit_count, rw_s_exit_count, rw_x_exit_count); + #endif + + fprintf(file, +-"Mutex spin waits %lu, rounds %lu, OS waits %lu\n" +-"RW-shared spins %lu, OS waits %lu; RW-excl spins %lu, OS waits %lu\n", +- (ulong) mutex_spin_wait_count, +- (ulong) mutex_spin_round_count, +- (ulong) mutex_os_wait_count, +- (ulong) rw_s_spin_wait_count, +- (ulong) rw_s_os_wait_count, +- (ulong) rw_x_spin_wait_count, +- (ulong) rw_x_os_wait_count); ++ "Mutex spin waits %llu, rounds %llu, OS waits %llu\n" ++ "RW-shared spins %llu, OS waits %llu;" ++ " RW-excl spins %llu, OS waits %llu\n", ++ mutex_spin_wait_count, ++ mutex_spin_round_count, ++ mutex_os_wait_count, ++ rw_s_spin_wait_count, ++ rw_s_os_wait_count, ++ rw_x_spin_wait_count, ++ rw_x_os_wait_count); ++ ++ fprintf(file, ++ "Spin rounds per wait: %.2f mutex, %.2f RW-shared, " ++ "%.2f RW-excl\n", ++ (double) mutex_spin_round_count / ++ (mutex_spin_wait_count ? 
mutex_spin_wait_count : 1), ++ (double) rw_s_spin_round_count / ++ (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1), ++ (double) rw_x_spin_round_count / ++ (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1)); + } + + /*********************************************************************** +diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_rw_lock.info 2009-10-22 15:18:30.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_rw_lock.patch ++Name=Fix of InnoDB rw_locks ported from InnoDB Plugin ++Version=1.0 ++Author=InnoBase Oy. ++License=GPL ++Comment= diff --git a/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch b/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch new file mode 100644 index 0000000..b4a1a79 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch @@ -0,0 +1,1357 @@ +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-05-20 14:21:44.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-05-20 14:39:34.000000000 +0900 +@@ -773,7 +773,7 @@ + rw_lock_s_lock(&btr_search_latch); + } + +- ut_ad(btr_search_latch.writer != RW_LOCK_EX); ++ ut_ad(btr_search_latch.writer_count == 0); + ut_ad(btr_search_latch.reader_count > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); +diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h +--- a/innobase/include/sync0rw.h 2009-01-30 06:42:20.000000000 +0900 ++++ b/innobase/include/sync0rw.h 2009-04-16 16:15:28.000000000 +0900 +@@ -325,7 +325,17 @@ + Accessor functions for rw lock. 
*/ + UNIV_INLINE + ulint +-rw_lock_get_waiters( ++rw_lock_get_s_waiters( ++/*==================*/ ++ rw_lock_t* lock); ++UNIV_INLINE ++ulint ++rw_lock_get_x_waiters( ++/*==================*/ ++ rw_lock_t* lock); ++UNIV_INLINE ++ulint ++rw_lock_get_wx_waiters( + /*================*/ + rw_lock_t* lock); + UNIV_INLINE +@@ -408,6 +418,17 @@ + rw_lock_debug_t* info); /* in: debug struct */ + #endif /* UNIV_SYNC_DEBUG */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++/* This value means NOT_LOCKED */ ++#define RW_LOCK_BIAS 0x00100000 ++#else ++#error HAVE_ATOMIC_BUILTINS is not defined. Do you use enough new GCC or compatibles? ++#error Or do you use exact options for CFLAGS? ++#error e.g. (for x86_32): "-m32 -march=i586 -mtune=i686" ++#error e.g. (for Sparc_64): "-m64 -mcpu=v9" ++#error Otherwise, this build may be slower than normal version. ++#endif ++ + /* NOTE! The structure appears here only for the compiler to know its size. + Do not use its fields directly! The structure used in the spin lock + implementation of a read-write lock. Several threads may have a shared lock +@@ -417,9 +438,9 @@ + field. Then no new readers are allowed in. */ + + struct rw_lock_struct { +- os_event_t event; /* Used by sync0arr.c for thread queueing */ +- +-#ifdef __WIN__ ++ /* Used by sync0arr.c for thread queueing */ ++ os_event_t s_event; /* Used for s_lock */ ++ os_event_t x_event; /* Used for x_lock */ + os_event_t wait_ex_event; /* This windows specific event is + used by the thread which has set the + lock state to RW_LOCK_WAIT_EX. The +@@ -427,31 +448,35 @@ + thread will be the next one to proceed + once the current the event gets + signalled. 
See LEMMA 2 in sync0sync.c */ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ volatile lint lock_word; /* Used by using atomic builtin */ + #endif + +- ulint reader_count; /* Number of readers who have locked this ++ volatile ulint reader_count; /* Number of readers who have locked this + lock in the shared mode */ +- ulint writer; /* This field is set to RW_LOCK_EX if there ++ volatile ulint writer; /* This field is set to RW_LOCK_EX if there + is a writer owning the lock (in exclusive + mode), RW_LOCK_WAIT_EX if a writer is + queueing for the lock, and + RW_LOCK_NOT_LOCKED, otherwise. */ +- os_thread_id_t writer_thread; ++ volatile os_thread_id_t writer_thread; + /* Thread id of a possible writer thread */ +- ulint writer_count; /* Number of times the same thread has ++ volatile ulint writer_count; /* Number of times the same thread has + recursively locked the lock in the exclusive + mode */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t mutex; /* The mutex protecting rw_lock_struct */ ++#endif + ulint pass; /* Default value 0. This is set to some + value != 0 given by the caller of an x-lock + operation, if the x-lock is to be passed to + another thread to unlock (which happens in + asynchronous i/o). */ +- ulint waiters; /* This ulint is set to 1 if there are +- waiters (readers or writers) in the global +- wait array, waiting for this rw_lock. +- Otherwise, == 0. 
*/ +- ibool writer_is_wait_ex; ++ volatile ulint s_waiters; /* 1: there are waiters (s_lock) */ ++ volatile ulint x_waiters; /* 1: there are waiters (x_lock) */ ++ volatile ulint wait_ex_waiters; /* 1: there are waiters (wait_ex) */ ++ volatile ibool writer_is_wait_ex; + /* This is TRUE if the writer field is + RW_LOCK_WAIT_EX; this field is located far + from the memory update hotspot fields which +diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic +--- a/innobase/include/sync0rw.ic 2009-01-30 06:42:20.000000000 +0900 ++++ b/innobase/include/sync0rw.ic 2009-04-16 17:06:53.000000000 +0900 +@@ -47,20 +47,64 @@ + Accessor functions for rw lock. */ + UNIV_INLINE + ulint +-rw_lock_get_waiters( ++rw_lock_get_s_waiters( + /*================*/ + rw_lock_t* lock) + { +- return(lock->waiters); ++ return(lock->s_waiters); + } + UNIV_INLINE +-void +-rw_lock_set_waiters( ++ulint ++rw_lock_get_x_waiters( + /*================*/ ++ rw_lock_t* lock) ++{ ++ return(lock->x_waiters); ++} ++UNIV_INLINE ++ulint ++rw_lock_get_wx_waiters( ++/*================*/ ++ rw_lock_t* lock) ++{ ++ return(lock->wait_ex_waiters); ++} ++UNIV_INLINE ++void ++rw_lock_set_s_waiters( + rw_lock_t* lock, + ulint flag) + { +- lock->waiters = flag; ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->s_waiters, flag); ++#else ++ lock->s_waiters = flag; ++#endif ++} ++UNIV_INLINE ++void ++rw_lock_set_x_waiters( ++ rw_lock_t* lock, ++ ulint flag) ++{ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->x_waiters, flag); ++#else ++ lock->x_waiters = flag; ++#endif ++} ++UNIV_INLINE ++void ++rw_lock_set_wx_waiters( ++/*================*/ ++ rw_lock_t* lock, ++ ulint flag) ++{ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->wait_ex_waiters, flag); ++#else ++ lock->wait_ex_waiters = flag; ++#endif + } + UNIV_INLINE + ulint +@@ -68,7 +112,19 @@ + /*===============*/ + rw_lock_t* lock) + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (lock->writer == 
RW_LOCK_NOT_LOCKED) { ++ return(RW_LOCK_NOT_LOCKED); ++ } ++ ++ if (lock->writer_is_wait_ex) { ++ return(RW_LOCK_WAIT_EX); ++ } else { ++ return(RW_LOCK_EX); ++ } ++#else + return(lock->writer); ++#endif + } + UNIV_INLINE + void +@@ -96,6 +152,7 @@ + { + lock->reader_count = count; + } ++#ifndef HAVE_ATOMIC_BUILTINS + UNIV_INLINE + mutex_t* + rw_lock_get_mutex( +@@ -104,6 +161,7 @@ + { + return(&(lock->mutex)); + } ++#endif + + /********************************************************************** + Returns the value of writer_count for the lock. Does not reserve the lock +@@ -133,14 +191,26 @@ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG ++#if defined(UNIV_SYNC_DEBUG) && !defined(HAVE_ATOMIC_BUILTINS) + ut_ad(mutex_own(rw_lock_get_mutex(lock))); + #endif /* UNIV_SYNC_DEBUG */ + /* Check if the writer field is free */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (UNIV_LIKELY(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)) { ++ /* try s-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) { ++ /* fail */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ return(FALSE); /* locking did not succeed */ ++ } ++ /* success */ ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) { + /* Set the shared lock by incrementing the reader count */ + lock->reader_count++; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, +@@ -167,11 +237,15 @@ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ + { +- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED); ++ ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); + ut_ad(rw_lock_get_reader_count(lock) == 0); + + /* Set the shared lock by incrementing the reader count */ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + lock->reader_count++; 
++#endif + + lock->last_s_file_name = file_name; + lock->last_s_line = line; +@@ -199,7 +273,11 @@ + + rw_lock_set_writer(lock, RW_LOCK_EX); + lock->writer_thread = os_thread_get_curr_id(); ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(lock->writer_count),1); ++#else + lock->writer_count++; ++#endif + lock->pass = 0; + + lock->last_x_file_name = file_name; +@@ -241,15 +319,21 @@ + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + #endif /* UNIV_SYNC_DEBUG */ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + rw_lock_s_lock_spin(lock, pass, file_name, line); + +@@ -272,11 +356,23 @@ + { + ibool success = FALSE; + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ /* try s-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) { ++ /* fail */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ return(FALSE); /* locking did not succeed */ ++ } ++ /* success */ ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + mutex_enter(rw_lock_get_mutex(lock)); + + if (lock->writer == RW_LOCK_NOT_LOCKED) { + /* Set the shared lock by incrementing the reader count */ + lock->reader_count++; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, +@@ -289,7 +385,9 @@ + success = TRUE; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return(success); + } +@@ -309,6 +407,54 @@ + { + ibool success = FALSE; + os_thread_id_t curr_thread = os_thread_get_curr_id(); ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (lock->reader_count == 0) { ++ /* try to lock writer */ ++ 
if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX) ++ == RW_LOCK_NOT_LOCKED) { ++ /* success */ ++retry_x_lock: ++ /* try x-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* success */ ++ lock->writer_thread = curr_thread; ++ lock->pass = 0; ++ lock->writer_is_wait_ex = FALSE; ++ /* next function may work as memory barrier */ ++ relock: ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ ut_ad(rw_lock_validate(lock)); ++ ++ return(TRUE); ++ } else { ++ /* fail (x-lock) */ ++ if (__sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS) ++ == 0) ++ goto retry_x_lock; ++ } ++ ++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED); ++ } ++ } ++ ++ if (lock->pass == 0 ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ goto relock; ++ } ++ ++ //ut_ad(rw_lock_validate(lock)); ++ ++ return(FALSE); ++#else + mutex_enter(rw_lock_get_mutex(lock)); + + if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) { +@@ -339,6 +485,7 @@ + ut_ad(rw_lock_validate(lock)); + + return(success); ++#endif + } + + /********************************************************************** +@@ -354,16 +501,33 @@ + #endif + ) + { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t* mutex = &(lock->mutex); +- ibool sg = FALSE; ++#endif ++ ibool x_sg = FALSE; ++ ibool wx_sg = FALSE; ++#ifdef HAVE_ATOMIC_BUILTINS ++ ibool last = FALSE; ++#endif + ++#ifndef HAVE_ATOMIC_BUILTINS + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter(mutex); ++#endif + + /* Reset the shared lock by decrementing the reader count */ + + ut_a(lock->reader_count > 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* unlock lock_word */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ ++ if(__sync_sub_and_fetch(&(lock->reader_count),1) == 0) { ++ last = TRUE; ++ } ++#else + lock->reader_count--; ++#endif 
+ + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); +@@ -372,22 +536,39 @@ + /* If there may be waiters and this was the last s-lock, + signal the object */ + +- if (UNIV_UNLIKELY(lock->waiters) ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0))) { ++ os_event_set(lock->wait_ex_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++ else if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->x_waiters, 0))) { ++ os_event_set(lock->x_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++#else ++ if (UNIV_UNLIKELY(lock->wait_ex_waiters) + && lock->reader_count == 0) { +- sg = TRUE; ++ wx_sg = TRUE; + +- rw_lock_set_waiters(lock, 0); ++ rw_lock_set_wx_waiters(lock, 0); ++ } ++ else if (UNIV_UNLIKELY(lock->x_waiters) ++ && lock->reader_count == 0) { ++ x_sg = TRUE; ++ ++ rw_lock_set_x_waiters(lock, 0); + } + + mutex_exit(mutex); + +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ ++ if (UNIV_UNLIKELY(wx_sg)) { + os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } else if (UNIV_UNLIKELY(x_sg)) { ++ os_event_set(lock->x_event); + sync_array_object_signalled(sync_primary_wait_array); + } ++#endif + + ut_ad(rw_lock_validate(lock)); + +@@ -409,13 +590,22 @@ + + ut_ad(lock->reader_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_sub_and_fetch(&(lock->reader_count),1); ++#else + lock->reader_count--; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED); + #endif + ++#ifdef HAVE_ATOMIC_BUILTINS ++ ut_ad(!lock->s_waiters); ++ ut_ad(!lock->x_waiters); ++#else + ut_ad(!lock->waiters); ++#endif + ut_ad(rw_lock_validate(lock)); + #ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +@@ -435,41 +625,83 @@ + #endif + ) + { +- ibool sg = FALSE; ++#ifdef HAVE_ATOMIC_BUILTINS ++ ibool last = FALSE; ++#endif ++ ibool s_sg = FALSE; ++ ibool x_sg = 
FALSE; + ++#ifndef HAVE_ATOMIC_BUILTINS + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter(&(lock->mutex)); ++#endif + + /* Reset the exclusive lock if this thread no longer has an x-mode + lock */ + + ut_ad(lock->writer_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) { ++ last = TRUE; ++ } ++ ++ if (last) { ++ /* unlock lock_word */ ++ __sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS); ++ ++ /* FIXME: It is a value of bad manners for pthread. ++ But we shouldn't keep an ID of not-owner. */ ++ lock->writer_thread = -1; ++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED); ++ } ++#else + lock->writer_count--; + + if (lock->writer_count == 0) { + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + } ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); + #endif + + /* If there may be waiters, signal the lock */ +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->writer_count == 0) { +- +- sg = TRUE; +- rw_lock_set_waiters(lock, 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (last) { ++ if(__sync_lock_test_and_set(&lock->s_waiters, 0)){ ++ s_sg = TRUE; ++ } ++ if(__sync_lock_test_and_set(&lock->x_waiters, 0)){ ++ x_sg = TRUE; ++ } ++ } ++#else ++ if (lock->writer_count == 0) { ++ if(lock->s_waiters){ ++ s_sg = TRUE; ++ rw_lock_set_s_waiters(lock, 0); ++ } ++ if(lock->x_waiters){ ++ x_sg = TRUE; ++ rw_lock_set_x_waiters(lock, 0); ++ } + } + + mutex_exit(&(lock->mutex)); ++#endif + +- if (UNIV_UNLIKELY(sg)) { ++ if (UNIV_UNLIKELY(s_sg)) { ++ os_event_set(lock->s_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++ if (UNIV_UNLIKELY(x_sg)) { + #ifdef __WIN__ ++ /* I doubt the necessity of it. 
*/ + os_event_set(lock->wait_ex_event); + #endif +- os_event_set(lock->event); ++ os_event_set(lock->x_event); + sync_array_object_signalled(sync_primary_wait_array); + } + +@@ -494,9 +726,13 @@ + + ut_ad(lock->writer_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) { ++#else + lock->writer_count--; + + if (lock->writer_count == 0) { ++#endif + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + } + +@@ -504,7 +740,12 @@ + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); + #endif + ++#ifdef HAVE_ATOMIC_BUILTINS ++ ut_ad(!lock->s_waiters); ++ ut_ad(!lock->x_waiters); ++#else + ut_ad(!lock->waiters); ++#endif + ut_ad(rw_lock_validate(lock)); + + #ifdef UNIV_SYNC_PERF_STAT +diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c +--- a/innobase/sync/sync0arr.c 2009-01-30 06:42:24.000000000 +0900 ++++ b/innobase/sync/sync0arr.c 2009-04-16 16:15:28.000000000 +0900 +@@ -309,13 +309,13 @@ + { + if (type == SYNC_MUTEX) { + return(os_event_reset(((mutex_t *) object)->event)); +-#ifdef __WIN__ + } else if (type == RW_LOCK_WAIT_EX) { + return(os_event_reset( + ((rw_lock_t *) object)->wait_ex_event)); +-#endif +- } else { +- return(os_event_reset(((rw_lock_t *) object)->event)); ++ } else if (type == RW_LOCK_SHARED) { ++ return(os_event_reset(((rw_lock_t *) object)->s_event)); ++ } else { /* RW_LOCK_EX */ ++ return(os_event_reset(((rw_lock_t *) object)->x_event)); + } + } + +@@ -415,15 +415,12 @@ + + if (cell->request_type == SYNC_MUTEX) { + event = ((mutex_t*) cell->wait_object)->event; +-#ifdef __WIN__ +- /* On windows if the thread about to wait is the one which +- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then +- it waits on a special event i.e.: wait_ex_event. 
*/ + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + event = ((rw_lock_t*) cell->wait_object)->wait_ex_event; +-#endif +- } else { +- event = ((rw_lock_t*) cell->wait_object)->event; ++ } else if (cell->request_type == RW_LOCK_SHARED) { ++ event = ((rw_lock_t*) cell->wait_object)->s_event; ++ } else { ++ event = ((rw_lock_t*) cell->wait_object)->x_event; + } + + cell->waiting = TRUE; +@@ -464,6 +461,7 @@ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; ++ ulint writer; + + type = cell->request_type; + +@@ -492,12 +490,10 @@ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX +-#ifdef __WIN__ + || type == RW_LOCK_WAIT_EX +-#endif + || type == RW_LOCK_SHARED) { + +- fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); ++ fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file); + + rwlock = cell->old_wait_rw_lock; + +@@ -505,21 +501,23 @@ + " RW-latch at %p created in file %s line %lu\n", + rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); +- if (rwlock->writer != RW_LOCK_NOT_LOCKED) { ++ writer = rw_lock_get_writer(rwlock); ++ if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), +- rwlock->writer == RW_LOCK_EX ++ writer == RW_LOCK_EX + ? 
" exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, +- "number of readers %lu, waiters flag %lu\n" ++ "number of readers %lu, s_waiters flag %lu, x_waiters flag %lu\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rwlock->reader_count, +- (ulong) rwlock->waiters, ++ (ulong) rwlock->s_waiters, ++ (ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters), + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, +@@ -839,11 +837,15 @@ + /*========================*/ + sync_array_t* arr) /* in: wait array */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(arr->sg_count),1); ++#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); ++#endif + } + + /************************************************************************** +@@ -880,19 +882,23 @@ + + mutex = cell->wait_object; + os_event_set(mutex->event); +-#ifdef __WIN__ + } else if (cell->request_type + == RW_LOCK_WAIT_EX) { + rw_lock_t* lock; + + lock = cell->wait_object; + os_event_set(lock->wait_ex_event); +-#endif +- } else { ++ } else if (cell->request_type ++ == RW_LOCK_SHARED) { + rw_lock_t* lock; + + lock = cell->wait_object; +- os_event_set(lock->event); ++ os_event_set(lock->s_event); ++ } else { ++ rw_lock_t* lock; ++ ++ lock = cell->wait_object; ++ os_event_set(lock->x_event); + } + } + } +diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c +--- a/innobase/sync/sync0rw.c 2009-01-30 06:42:24.000000000 +0900 ++++ b/innobase/sync/sync0rw.c 2009-04-16 17:33:59.000000000 +0900 +@@ -99,6 +99,7 @@ + object is created, then the following call initializes + the sync system. 
*/ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_create(rw_lock_get_mutex(lock)); + mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + +@@ -108,8 +109,14 @@ + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ ++#endif /* !HAVE_ATOMIC_BUILTINS */ + +- rw_lock_set_waiters(lock, 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ lock->lock_word = RW_LOCK_BIAS; ++#endif ++ rw_lock_set_s_waiters(lock, 0); ++ rw_lock_set_x_waiters(lock, 0); ++ rw_lock_set_wx_waiters(lock, 0); + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + lock->writer_count = 0; + rw_lock_set_reader_count(lock, 0); +@@ -130,11 +137,9 @@ + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; +- lock->event = os_event_create(NULL); +- +-#ifdef __WIN__ ++ lock->s_event = os_event_create(NULL); ++ lock->x_event = os_event_create(NULL); + lock->wait_ex_event = os_event_create(NULL); +-#endif + + mutex_enter(&rw_lock_list_mutex); + +@@ -162,19 +167,21 @@ + ut_a(rw_lock_validate(lock)); + #endif /* UNIV_DEBUG */ + ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); +- ut_a(rw_lock_get_waiters(lock) == 0); ++ ut_a(rw_lock_get_s_waiters(lock) == 0); ++ ut_a(rw_lock_get_x_waiters(lock) == 0); ++ ut_a(rw_lock_get_wx_waiters(lock) == 0); + ut_a(rw_lock_get_reader_count(lock) == 0); + + lock->magic_n = 0; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_free(rw_lock_get_mutex(lock)); ++#endif + + mutex_enter(&rw_lock_list_mutex); +- os_event_free(lock->event); +- +-#ifdef __WIN__ ++ os_event_free(lock->s_event); ++ os_event_free(lock->x_event); + os_event_free(lock->wait_ex_event); +-#endif + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); +@@ -192,26 +199,43 @@ + Checks that the rw-lock has been initialized and that there are no + simultaneous shared and exclusive locks. */ + ++/* MEMO: If HAVE_ATOMIC_BUILTINS, we should use this function statically. 
*/ ++ + ibool + rw_lock_validate( + /*=============*/ + rw_lock_t* lock) + { ++ ulint test; + ut_a(lock); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); ++#ifndef HAVE_ATOMIC_BUILTINS + ut_a((rw_lock_get_reader_count(lock) == 0) + || (rw_lock_get_writer(lock) != RW_LOCK_EX)); +- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)); +- ut_a((rw_lock_get_waiters(lock) == 0) +- || (rw_lock_get_waiters(lock) == 1)); ++#endif ++ test = rw_lock_get_writer(lock); ++ ut_a((test == RW_LOCK_EX) ++ || (test == RW_LOCK_WAIT_EX) ++ || (test == RW_LOCK_NOT_LOCKED)); ++ test = rw_lock_get_s_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++ test = rw_lock_get_x_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++ test = rw_lock_get_wx_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++#ifndef HAVE_ATOMIC_BUILTINS + ut_a((lock->writer != RW_LOCK_EX) || (lock->writer_count > 0)); + + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return(TRUE); + } +@@ -237,13 +261,14 @@ + ut_ad(rw_lock_validate(lock)); + + lock_loop: ++ i = 0; ++spin_loop: + rw_s_spin_wait_count++; + + /* Spin waiting for the writer field to become free */ +- i = 0; + +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS ++ && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } +@@ -262,15 +287,27 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + /* We try once again to obtain the lock */ + + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Success */ + } else { ++#ifdef 
HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ i++; ++ ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto spin_loop; ++ } ++#endif + /* If we get here, locking did not succeed, we may + suspend the thread to wait in the wait array */ + +@@ -281,9 +318,26 @@ + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); ++ rw_lock_set_s_waiters(lock, 1); ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ for (i = 0; i < 4; i++) { ++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Success */ ++ } ++ } + ++ /* If wait_ex_waiter stalls, wakes it. */ ++ if (lock->reader_count == 0 ++ && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0)) { ++ os_event_set(lock->wait_ex_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++#else + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -318,13 +372,19 @@ + { + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + lock->writer_thread = os_thread_get_curr_id(); + + lock->pass = 0; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#else ++ __sync_synchronize(); ++#endif + } + + /********************************************************************** +@@ -342,6 +402,89 @@ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); ++retry_writer: ++ /* try to lock writer */ ++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX) ++ == RW_LOCK_NOT_LOCKED) { ++ /* success */ ++ /* obtain RW_LOCK_WAIT_EX right */ ++ lock->writer_thread = curr_thread; ++ lock->pass = pass; ++ lock->writer_is_wait_ex = TRUE; ++ /* atomic operation may be safer about memory order. 
*/ ++ __sync_synchronize(); ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, ++ file_name, line); ++#endif ++ } ++ ++ if (!os_thread_eq(lock->writer_thread, curr_thread)) { ++ return(RW_LOCK_NOT_LOCKED); ++ } ++ ++ switch(rw_lock_get_writer(lock)) { ++ case RW_LOCK_WAIT_EX: ++ /* have right to try x-lock */ ++retry_x_lock: ++ /* try x-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* success */ ++ lock->pass = pass; ++ lock->writer_is_wait_ex = FALSE; ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ file_name, line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ /* Locking succeeded, we may return */ ++ return(RW_LOCK_EX); ++ } else if(__sync_fetch_and_add(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* retry x-lock */ ++ goto retry_x_lock; ++ } ++ ++ /* There are readers, we have to wait */ ++ return(RW_LOCK_WAIT_EX); ++ ++ break; ++ ++ case RW_LOCK_EX: ++ /* already have x-lock */ ++ if ((lock->pass == 0)&&(pass == 0)) { ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, ++ line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ /* Locking succeeded, we may return */ ++ return(RW_LOCK_EX); ++ } ++ ++ return(RW_LOCK_NOT_LOCKED); ++ ++ break; ++ ++ default: /* RW_LOCK_NOT_LOCKED? 
maybe impossible */ ++ goto retry_writer; ++ } ++#else /* HAVE_ATOMIC_BUILTINS */ ++ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(rw_lock_get_mutex(lock))); + #endif /* UNIV_SYNC_DEBUG */ +@@ -423,6 +566,7 @@ + /* Locking succeeded, we may return */ + return(RW_LOCK_EX); + } ++#endif /* HAVE_ATOMIC_BUILTINS */ + + /* Locking did not succeed */ + return(RW_LOCK_NOT_LOCKED); +@@ -448,19 +592,33 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint state; /* lock state acquired */ ++ ulint state = RW_LOCK_NOT_LOCKED; /* lock state acquired */ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ulint prev_state = RW_LOCK_NOT_LOCKED; ++#endif + ulint i; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + + lock_loop: ++ i = 0; ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ prev_state = state; ++#else + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter_fast(&(lock->mutex)); ++#endif + + state = rw_lock_x_lock_low(lock, pass, file_name, line); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (state != prev_state) i=0; /* if progress, reset counter. 
*/ ++#else + mutex_exit(&(lock->mutex)); ++#endif + ++spin_loop: + if (state == RW_LOCK_EX) { + + return; /* Locking succeeded */ +@@ -468,10 +626,9 @@ + } else if (state == RW_LOCK_NOT_LOCKED) { + + /* Spin waiting for the writer field to become free */ +- i = 0; + +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS ++ && lock->lock_word != RW_LOCK_BIAS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); +@@ -485,9 +642,12 @@ + } else if (state == RW_LOCK_WAIT_EX) { + + /* Spin waiting for the reader count field to become zero */ +- i = 0; + ++#ifdef HAVE_ATOMIC_BUILTINS ++ while (lock->lock_word != RW_LOCK_BIAS ++#else + while (rw_lock_get_reader_count(lock) != 0 ++#endif + && i < SYNC_SPIN_ROUNDS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, +@@ -500,7 +660,6 @@ + os_thread_yield(); + } + } else { +- i = 0; /* Eliminate a compiler warning */ + ut_error; + } + +@@ -516,34 +675,69 @@ + /* We try once again to obtain the lock. Acquire the mutex protecting + the rw-lock fields */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++ prev_state = state; ++#else + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + state = rw_lock_x_lock_low(lock, pass, file_name, line); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (state != prev_state) i=0; /* if progress, reset counter. */ ++#endif ++ + if (state == RW_LOCK_EX) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Locking succeeded */ + } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ i++; ++ ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto spin_loop; ++ } ++#endif ++ + rw_x_system_call_count++; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, +-#ifdef __WIN__ +- /* On windows RW_LOCK_WAIT_EX signifies +- that this thread should wait on the +- special wait_ex_event. */ + (state == RW_LOCK_WAIT_EX) + ? 
RW_LOCK_WAIT_EX : +-#endif + RW_LOCK_EX, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); ++ if (state == RW_LOCK_WAIT_EX) { ++ rw_lock_set_wx_waiters(lock, 1); ++ } else { ++ rw_lock_set_x_waiters(lock, 1); ++ } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ for (i = 0; i < 4; i++) { ++ prev_state = state; ++ state = rw_lock_x_lock_low(lock, pass, file_name, line); ++ if (state == RW_LOCK_EX) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Locking succeeded */ ++ } ++ if (state != prev_state) { ++ /* retry! */ ++ sync_array_free_cell(sync_primary_wait_array, index); ++ goto lock_loop; ++ } ++ } ++#else + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -718,7 +912,9 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + info = UT_LIST_GET_FIRST(lock->debug_list); + +@@ -728,7 +924,9 @@ + && (info->pass == 0) + && (info->lock_type == lock_type)) { + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + /* Found! 
*/ + + return(TRUE); +@@ -736,7 +934,9 @@ + + info = UT_LIST_GET_NEXT(list, info); + } ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + + return(FALSE); + } +@@ -758,21 +958,25 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + if (lock_type == RW_LOCK_SHARED) { + if (lock->reader_count > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { +- if (lock->writer == RW_LOCK_EX) { ++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + + return(ret); + } +@@ -801,16 +1005,26 @@ + + count++; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++ || (rw_lock_get_s_waiters(lock) != 0) ++ || (rw_lock_get_x_waiters(lock) != 0) ++ || (rw_lock_get_wx_waiters(lock) != 0)) { + + fprintf(stderr, "RW-LOCK: %p ", lock); + +- if (rw_lock_get_waiters(lock)) { +- fputs(" Waiters for the lock exist\n", stderr); ++ if (rw_lock_get_s_waiters(lock)) { ++ fputs(" s_waiters for the lock exist,", stderr); ++ } ++ if (rw_lock_get_x_waiters(lock)) { ++ fputs(" x_waiters for the lock exist\n", stderr); ++ } ++ if (rw_lock_get_wx_waiters(lock)) { ++ fputs(" wait_ex_waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } +@@ -822,7 +1036,9 @@ + } + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + lock = UT_LIST_GET_NEXT(list, lock); + } + +@@ -847,10 +1063,18 @@ + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++ || (rw_lock_get_s_waiters(lock) != 0) ++ || (rw_lock_get_x_waiters(lock) != 0) ++ || (rw_lock_get_wx_waiters(lock) != 0)) { + +- if (rw_lock_get_waiters(lock)) { +- 
fputs(" Waiters for the lock exist\n", stderr); ++ if (rw_lock_get_s_waiters(lock)) { ++ fputs(" s_waiters for the lock exist,", stderr); ++ } ++ if (rw_lock_get_x_waiters(lock)) { ++ fputs(" x_waiters for the lock exist\n", stderr); ++ } ++ if (rw_lock_get_wx_waiters(lock)) { ++ fputs(" wait_ex_waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } +@@ -909,14 +1133,18 @@ + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0)) { + count++; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + lock = UT_LIST_GET_NEXT(list, lock); + } + +diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_rw_lock.info 2009-04-16 16:15:28.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_rw_lock.patch ++Name=Fix of InnoDB rw_locks ++Version=1.0 ++Author=Yasufumi Kinoshita ++License=BSD ++Comment= diff --git a/percona/5.0.91-b22-20100522/innodb_show_bp.patch b/percona/5.0.91-b22-20100522/innodb_show_bp.patch new file mode 100644 index 0000000..d964785 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_show_bp.patch @@ -0,0 +1,453 @@ +diff -r fe944d2c6e1f innobase/btr/btr0btr.c +--- a/innobase/btr/btr0btr.c Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/btr/btr0btr.c Mon Nov 10 19:48:24 2008 -0800 +@@ -2989,3 +2989,11 @@ + + return(TRUE); + } ++ ++dulint ++btr_page_get_index_id_noninline( ++/*============*/ ++ page_t* page) /* in: index page */ ++{ ++ return btr_page_get_index_id(page); ++} +diff -r fe944d2c6e1f innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/buf/buf0buf.c Mon Nov 10 19:48:24 2008 -0800 +@@ -2629,3 +2629,13 @@ + buf_block_print(block); + } + ++buf_block_t* 
++buf_pool_get_nth_block_no_inline( ++/*===================*/ ++ /* out: pointer to block */ ++ buf_pool_t* buf_pool,/* in: buf_pool */ ++ ulint i) /* in: index of the block */{ ++ ++return buf_pool_get_nth_block(buf_pool, i); ++ ++} +diff -r fe944d2c6e1f innobase/include/btr0btr.h +--- a/innobase/include/btr0btr.h Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/include/btr0btr.h Mon Nov 10 19:48:24 2008 -0800 +@@ -69,6 +69,12 @@ + UNIV_INLINE + dulint + btr_page_get_index_id( ++/*==================*/ ++ /* out: index id */ ++ page_t* page); /* in: index page */ ++ ++dulint ++btr_page_get_index_id_noninline( + /*==================*/ + /* out: index id */ + page_t* page); /* in: index page */ +diff -r fe944d2c6e1f innobase/include/buf0buf.h +--- a/innobase/include/buf0buf.h Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/include/buf0buf.h Mon Nov 10 19:48:24 2008 -0800 +@@ -703,6 +703,8 @@ + buf_get_free_list_len(void); + /*=======================*/ + ++void buf_pool_dump(void); ++buf_block_t* buf_pool_get_nth_block_no_inline(buf_pool_t* pool, ulint i); + + + /* The buffer control block structure */ +diff -r fe944d2c6e1f innobase/include/page0page.h +--- a/innobase/include/page0page.h Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/include/page0page.h Mon Nov 10 19:48:24 2008 -0800 +@@ -260,6 +260,12 @@ + /*============*/ + /* out: number of user records */ + page_t* page); /* in: index page */ ++ ++ulint ++page_get_n_recs_noninline( ++/*============*/ ++ /* out: number of user records */ ++ page_t* page); /* in: index page */ + /******************************************************************* + Returns the number of records before the given record in chain. + The number includes infimum and supremum records. 
*/ +@@ -519,6 +525,12 @@ + UNIV_INLINE + ulint + page_get_data_size( ++/*===============*/ ++ /* out: data in bytes */ ++ page_t* page); /* in: index page */ ++ ++ulint ++page_get_data_size_noninline( + /*===============*/ + /* out: data in bytes */ + page_t* page); /* in: index page */ +diff -r fe944d2c6e1f innobase/page/page0page.c +--- a/innobase/page/page0page.c Mon Nov 10 19:47:27 2008 -0800 ++++ b/innobase/page/page0page.c Mon Nov 10 19:48:24 2008 -0800 +@@ -1994,3 +1994,25 @@ + page_cur_move_to_next(&cur); + } + } ++ ++ulint ++page_get_n_recs_noninline( ++/*============*/ ++ /* out: number of user records */ ++ page_t* page) /* in: index page */ ++{ ++ return page_get_n_recs(page); ++} ++ ++ ++ulint ++page_get_data_size_noninline( ++/*============*/ ++ /* out: number of user records */ ++ page_t* page) /* in: index page */ ++{ ++ return page_get_data_size(page); ++} ++ ++ ++ +diff -r fe944d2c6e1f mysql-test/r/information_schema.result +--- a/mysql-test/r/information_schema.result Mon Nov 10 19:47:27 2008 -0800 ++++ b/mysql-test/r/information_schema.result Mon Nov 10 19:48:25 2008 -0800 +@@ -42,6 +42,7 @@ + COLLATION_CHARACTER_SET_APPLICABILITY + COLUMNS + COLUMN_PRIVILEGES ++INNODB_BUFFER_POOL_CONTENT + INDEX_STATISTICS + KEY_COLUMN_USAGE + PROCESSLIST +@@ -741,7 +742,7 @@ + CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1; + CREATE VIEW a2 AS SELECT t_CRASHME FROM a1; + count(*) +-107 ++108 + drop view a2, a1; + drop table t_crashme; + select table_schema,table_name, column_name from +@@ -802,6 +803,7 @@ + TABLE_NAME COLUMN_NAME PRIVILEGES + COLUMNS TABLE_NAME select + COLUMN_PRIVILEGES TABLE_NAME select ++INNODB_BUFFER_POOL_CONTENT TABLE_NAME select + INDEX_STATISTICS TABLE_NAME select + KEY_COLUMN_USAGE TABLE_NAME select + STATISTICS TABLE_NAME select +@@ -815,7 +817,7 @@ + flush privileges; + SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA; + table_schema count(*) +-information_schema 22 
++information_schema 23 + mysql 17 + create table t1 (i int, j int); + create trigger trg1 before insert on t1 for each row +@@ -1206,6 +1208,7 @@ + COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME + COLUMNS TABLE_SCHEMA + COLUMN_PRIVILEGES TABLE_SCHEMA ++INNODB_BUFFER_POOL_CONTENT TABLE_SCHEMA + INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA + PROCESSLIST ID +@@ -1243,6 +1246,7 @@ + COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME + COLUMNS TABLE_SCHEMA + COLUMN_PRIVILEGES TABLE_SCHEMA ++INNODB_BUFFER_POOL_CONTENT TABLE_SCHEMA + INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA + PROCESSLIST ID +@@ -1332,6 +1336,7 @@ + COLUMNS information_schema.COLUMNS 1 + COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1 + INDEX_STATISTICS information_schema.INDEX_STATISTICS 1 ++INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1 + KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 + PROCESSLIST information_schema.PROCESSLIST 1 + PROFILING information_schema.PROFILING 1 +diff -r fe944d2c6e1f mysql-test/r/information_schema_db.result +--- a/mysql-test/r/information_schema_db.result Mon Nov 10 19:47:27 2008 -0800 ++++ b/mysql-test/r/information_schema_db.result Mon Nov 10 19:48:25 2008 -0800 +@@ -11,6 +11,7 @@ + COLLATION_CHARACTER_SET_APPLICABILITY + COLUMNS + COLUMN_PRIVILEGES ++INNODB_BUFFER_POOL_CONTENT + INDEX_STATISTICS + KEY_COLUMN_USAGE + PROCESSLIST +diff -r fe944d2c6e1f mysql-test/r/mysqlshow.result +--- a/mysql-test/r/mysqlshow.result Mon Nov 10 19:47:27 2008 -0800 ++++ b/mysql-test/r/mysqlshow.result Mon Nov 10 19:48:25 2008 -0800 +@@ -85,6 +85,7 @@ + | COLLATION_CHARACTER_SET_APPLICABILITY | + | COLUMNS | + | COLUMN_PRIVILEGES | ++| INNODB_BUFFER_POOL_CONTENT | + | INDEX_STATISTICS | + | KEY_COLUMN_USAGE | + | PROCESSLIST | +@@ -112,6 +113,7 @@ + | COLLATION_CHARACTER_SET_APPLICABILITY | + | COLUMNS | + | COLUMN_PRIVILEGES | ++| INNODB_BUFFER_POOL_CONTENT | + | INDEX_STATISTICS | + | 
KEY_COLUMN_USAGE | + | PROCESSLIST | +diff -r fe944d2c6e1f patch_info/innodb_show_bp.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_show_bp.info Mon Nov 10 19:48:25 2008 -0800 +@@ -0,0 +1,6 @@ ++File=innodb_show_bp.patch ++Name=show innodb buffer pool content ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= +diff -r fe944d2c6e1f sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Nov 10 19:47:27 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Nov 10 19:48:25 2008 -0800 +@@ -128,10 +128,12 @@ + #include "../innobase/include/lock0lock.h" + #include "../innobase/include/dict0crea.h" + #include "../innobase/include/btr0cur.h" ++#include "../innobase/include/buf0buf.h" + #include "../innobase/include/btr0btr.h" + #include "../innobase/include/fsp0fsp.h" + #include "../innobase/include/sync0sync.h" + #include "../innobase/include/fil0fil.h" ++#include "../innobase/include/page0page.h" + #include "../innobase/include/trx0xa.h" + } + +@@ -6483,6 +6485,116 @@ + DBUG_RETURN(FALSE); + } + ++bool ++innodb_I_S_buffer_pool_content(THD* thd, TABLE_LIST *tables) ++{ ++ ulint size; ++ ulint i; ++ dulint id; ++ ulint n_found; ++ buf_frame_t* frame; ++ dict_index_t* index; ++ buf_block_t* block; ++ ++ const char *p; ++ char db_name_raw[NAME_LEN*5+1]; ++ char table_name_raw[NAME_LEN*5+1]; ++ ++ DBUG_ENTER("innodb_I_S_buffer_pool_content"); ++ ++ ++ size = buf_pool->curr_size; ++ ++ n_found = 0; ++ ++ TABLE *table= tables->table; ++ ++ ++ //buf_pool_dump(); ++ ++ ++ for (i = 0; i < size; i++) { ++ block = buf_pool_get_nth_block_no_inline(buf_pool, i); ++ frame = block->frame; ++ if (fil_page_get_type(frame)==0) continue; ++ ++ char page_type[64]; ++ ++ switch(fil_page_get_type(frame)) ++ { ++ case FIL_PAGE_INDEX: ++ strcpy(page_type, "index"); ++ break; ++ case FIL_PAGE_UNDO_LOG: ++ strcpy(page_type, "undo_log"); ++ break; ++ case FIL_PAGE_INODE: ++ strcpy(page_type, "inode"); ++ break; ++ case FIL_PAGE_IBUF_FREE_LIST: ++ strcpy(page_type, 
"ibuf_free_list"); ++ break; ++ default: ++ sprintf(page_type, "unknown", fil_page_get_type(frame)); ++ } ++ ++ table->field[0]->store((longlong)i, TRUE); ++ table->field[1]->store((longlong)block->space, TRUE); ++ table->field[2]->store((longlong)block->offset, TRUE); ++ table->field[3]->store((longlong)page_get_n_recs_noninline(block->frame), TRUE); ++ table->field[4]->store( ( fil_page_get_type(frame) == FIL_PAGE_INDEX ) ? (longlong)page_get_data_size_noninline(block->frame):0, TRUE); ++ table->field[5]->store((longlong)block->flush_type, TRUE); ++ table->field[6]->store((longlong)block->buf_fix_count, TRUE); ++ table->field[7]->store((longlong)block->LRU_position, TRUE); ++ table->field[8]->store((longlong)fil_page_get_type(frame), TRUE); ++ ++ table->field[9]->store(page_type, strlen(page_type), system_charset_info); ++ ++ //fprintf(stderr, "block N %d, space %d, offset %d, records %d, datasize %d, page_type %s, flush_type %d, buf_fix_count %d, LRU_position %d", i, block->space, block->offset, page_get_n_recs_noninline(block->frame), page_get_data_size_noninline(block->frame), page_type,block->flush_type, block->buf_fix_count, block->LRU_position); ++ ++ // flush_type, buf_fix_count, LRU_position ++ ++ if (fil_page_get_type(frame) == FIL_PAGE_INDEX) { ++ ++ id = btr_page_get_index_id_noninline(frame); ++ index = dict_index_get_if_in_cache(id); ++ if (index) { ++ table->field[10]->store(index->name, strlen(index->name), system_charset_info); ++ // fprintf(stderr, " index %s, table %s", index->name, index->table_name); ++ ++ if((p = strchr((char *) index->table_name, '/'))) ++ { ++ strncpy(db_name_raw, index->table_name, p-index->table_name); ++ db_name_raw[p-index->table_name] = 0; ++ table->field[11]->store(db_name_raw, strlen(db_name_raw), system_charset_info); ++ p++; ++ } else { ++ table->field[11]->store(NULL, 0, system_charset_info); ++ p = index->table_name; ++ } ++ strcpy(table_name_raw, p); ++ ++ table->field[12]->store(table_name_raw, 
strlen(table_name_raw), system_charset_info); ++ } else { ++ table->field[10]->store(NULL, 0, system_charset_info); ++ table->field[11]->store(NULL, 0, system_charset_info); ++ table->field[12]->store(NULL, 0, system_charset_info); ++ } ++ }else{ ++ table->field[10]->store(NULL, 0, system_charset_info); ++ table->field[11]->store(NULL, 0, system_charset_info); ++ table->field[12]->store(NULL, 0, system_charset_info); ++ } ++ //fprintf(stderr, "\n"); ++ if (schema_table_store_record(thd, table)) ++ { ++ DBUG_RETURN(1); ++ } ++ } ++ ++ DBUG_RETURN(0); ++} ++ + /**************************************************************************** + Implements the SHOW MUTEX STATUS command. . */ + +diff -r fe944d2c6e1f sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Nov 10 19:47:27 2008 -0800 ++++ b/sql/ha_innodb.h Mon Nov 10 19:48:25 2008 -0800 +@@ -263,6 +263,7 @@ + + int innobase_drop_database(char *path); + bool innodb_show_status(THD* thd); ++bool innodb_I_S_buffer_pool_content(THD* thd, TABLE_LIST *tables); + bool innodb_mutex_show_status(THD* thd); + void innodb_export_status(void); + +diff -r fe944d2c6e1f sql/sql_parse.cc +--- a/sql/sql_parse.cc Mon Nov 10 19:47:27 2008 -0800 ++++ b/sql/sql_parse.cc Mon Nov 10 19:48:25 2008 -0800 +@@ -2926,6 +2926,7 @@ + case SCH_COLUMN_PRIVILEGES: + case SCH_TABLE_CONSTRAINTS: + case SCH_KEY_COLUMN_USAGE: ++ case SCH_INNODB_I_S_BUFFER_POOL_CONTENT: + default: + break; + } +diff -r fe944d2c6e1f sql/sql_show.cc +--- a/sql/sql_show.cc Mon Nov 10 19:47:27 2008 -0800 ++++ b/sql/sql_show.cc Mon Nov 10 19:48:25 2008 -0800 +@@ -27,6 +27,10 @@ + + #ifdef HAVE_BERKELEY_DB + #include "ha_berkeley.h" // For berkeley_show_logs ++#endif ++ ++#ifdef HAVE_INNOBASE_DB ++#include "ha_innodb.h" + #endif + + #ifndef NO_EMBEDDED_ACCESS_CHECKS +@@ -4042,6 +4046,19 @@ + DBUG_RETURN(res); + } + ++int fill_innodb_bp_content(THD *thd, TABLE_LIST *tables, COND *cond) ++{ ++ DBUG_ENTER("fill_innodb_bp_content"); ++ int res= 0; ++ ++ /* deny access to non-superusers */ 
++ if (check_global_access(thd, PROCESS_ACL)) { ++ DBUG_RETURN(0); ++ } ++ ++ innodb_I_S_buffer_pool_content(thd, tables); ++ DBUG_RETURN(res); ++} + + /* + Find schema_tables elment by name +@@ -4951,6 +4962,24 @@ + }; + + ++ST_FIELD_INFO innodb_bp_content_fields_info[]= ++{ ++ {"BLOCK_NUM", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Block_num"}, ++ {"SPACE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Space"}, ++ {"OFFSET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Offset"}, ++ {"RECORDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Records"}, ++ {"DATASIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Datasize"}, ++ {"FLUSH_TYPE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Flush_type"}, ++ {"FIX_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Fix_count"}, ++ {"LRU_POSITION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "LRU_position"}, ++ {"PAGE_TYPE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Page_type_id"}, ++ {"PAGE_TYPE", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Page_type"}, ++ {"INDEX_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Index_name"}, ++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schem"}, ++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ + /* + Description of ST_FIELD_INFO in table.h + */ +@@ -4969,6 +4998,8 @@ + get_all_tables, make_columns_old_format, get_schema_column_record, 1, 2, 0}, + {"COLUMN_PRIVILEGES", column_privileges_fields_info, create_schema_table, + fill_schema_column_privileges, 0, 0, -1, -1, 0}, ++ {"INNODB_BUFFER_POOL_CONTENT", innodb_bp_content_fields_info, create_schema_table, ++ fill_innodb_bp_content, 0, 0, -1, -1, 0}, + {"INDEX_STATISTICS", index_stats_fields_info, create_schema_table, + fill_schema_index_stats, make_old_format, 0, -1, -1, 0}, + {"KEY_COLUMN_USAGE", key_column_usage_fields_info, create_schema_table, +diff -r fe944d2c6e1f sql/table.h +--- 
a/sql/table.h Mon Nov 10 19:47:27 2008 -0800 ++++ b/sql/table.h Mon Nov 10 19:48:25 2008 -0800 +@@ -375,6 +375,7 @@ + SCH_COLLATION_CHARACTER_SET_APPLICABILITY, + SCH_COLUMNS, + SCH_COLUMN_PRIVILEGES, ++ SCH_INNODB_I_S_BUFFER_POOL_CONTENT, + SCH_INDEX_STATS, + SCH_KEY_COLUMN_USAGE, + SCH_OPEN_TABLES, diff --git a/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch new file mode 100644 index 0000000..191193e --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch @@ -0,0 +1,275 @@ +diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c +--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900 +@@ -2454,13 +2454,15 @@ + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } + fprintf(file, +- "Buffer pool size %lu\n" +- "Free buffers %lu\n" +- "Database pages %lu\n" +- "Modified db pages %lu\n" ++ "Buffer pool size %lu\n" ++ "Buffer pool size, bytes %lu\n" ++ "Free buffers %lu\n" ++ "Database pages %lu\n" ++ "Modified db pages %lu\n" + "Pending reads %lu\n" + "Pending writes: LRU %lu, flush list %lu, single page %lu\n", + (ulong) size, ++ (ulong) size * UNIV_PAGE_SIZE, + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), +diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c +--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900 +@@ -4472,3 +4472,30 @@ + + return(mach_read_from_2(page + FIL_PAGE_TYPE)); + } ++ ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++fil_system_hash_cells(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (fil_system->spaces->n_cells ++ + fil_system->name_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++fil_system_hash_nodes(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (UT_LIST_GET_LEN(fil_system->space_list) ++ * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h +--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900 +@@ -701,6 +701,16 @@ + written to page, the return value not defined */ + byte* page); /* in: file page */ + ++/************************************************************************* ++Return local hash table informations. */ ++ ++ulint ++fil_system_hash_cells(void); ++/*========================*/ ++ ++ulint ++fil_system_hash_nodes(void); ++/*========================*/ + + typedef struct fil_space_struct fil_space_t; + +diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h +--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900 +@@ -77,6 +77,17 @@ + /*=============================*/ + /* out: pointer to the in_ibuf field */ + ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++thr_local_hash_cells(void); ++/*=======================*/ ++ ++ulint ++thr_local_hash_nodes(void); ++/*=======================*/ ++ + #ifndef UNIV_NONINL + #include "thr0loc.ic" + #endif +diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c +--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900 +@@ -1645,6 +1645,14 @@ + time_t current_time; + ulint n_reserved; + ++ ulint btr_search_sys_subtotal; ++ ulint lock_sys_subtotal; ++ ulint recv_sys_subtotal; ++ ulint io_counter_subtotal; ++ ++ ulint i; ++ trx_t* trx; ++ + mutex_enter(&srv_innodb_monitor_mutex); + + current_time = time(NULL); +@@ -1747,6 +1755,91 @@ + ut_total_allocated_memory, + mem_pool_get_reserved(mem_comm_pool)); + ++ /* Calcurate reserved memories */ ++ if (btr_search_sys && btr_search_sys->hash_index->heap) { ++ btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap); ++ } else { ++ btr_search_sys_subtotal = 0; ++ for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) { ++ btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]); ++ } ++ } ++ ++ lock_sys_subtotal = 0; ++ if (trx_sys) { ++ mutex_enter(&kernel_mutex); ++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); ++ while (trx) { ++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0); ++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); ++ } ++ mutex_exit(&kernel_mutex); ++ } ++ ++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash) ++ ? mem_heap_get_size(recv_sys->heap) : 0); ++ ++ io_counter_subtotal = ((buf_pool->io_counter_heap) ++ ? 
mem_heap_get_size(buf_pool->io_counter_heap) : 0); ++ ++ fprintf(file, ++ "Internal hash tables (constant factor + variable factor)\n" ++ " Adaptive hash index %lu \t(%lu + %lu)\n" ++ " Page hash %lu\n" ++ " Dictionary cache %lu \t(%lu + %lu)\n" ++ " File system %lu \t(%lu + %lu)\n" ++ " Lock system %lu \t(%lu + %lu)\n" ++ " Recovery system %lu \t(%lu + %lu)\n" ++ " Threads %lu \t(%lu + %lu)\n" ++ " innodb_io_pattern %lu \t(%lu + %lu)\n", ++ ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0) ++ + btr_search_sys_subtotal, ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) btr_search_sys_subtotal, ++ ++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)), ++ ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t) ++ + dict_sys->size) : 0), ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0), ++ (ulong) (dict_sys ? (dict_sys->size) : 0), ++ ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t) ++ + fil_system_hash_nodes()), ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) fil_system_hash_nodes(), ++ ++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + lock_sys_subtotal), ++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) lock_sys_subtotal, ++ ++ (ulong) (((recv_sys && recv_sys->addr_hash) ++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + recv_sys_subtotal), ++ (ulong) ((recv_sys && recv_sys->addr_hash) ++ ? 
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) recv_sys_subtotal, ++ ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t) ++ + thr_local_hash_nodes()), ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) thr_local_hash_nodes(), ++ ++ (ulong) (((buf_pool->io_counter_hash) /* needs &(buf_pool->mutex) ? */ ++ ? (buf_pool->io_counter_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + io_counter_subtotal), ++ (ulong) ((buf_pool->io_counter_hash) /* needs &(buf_pool->mutex) ? */ ++ ? (buf_pool->io_counter_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) io_counter_subtotal); ++ + if (srv_use_awe) { + fprintf(file, + "In addition to that %lu MB of AWE memory allocated\n", +diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c +--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900 +@@ -32,6 +32,7 @@ + + /* The hash table. The module is not yet initialized when it is NULL. */ + hash_table_t* thr_local_hash = NULL; ++ulint thr_local_hash_n_nodes = 0; + + /* The private data for each thread should be put to + the structure below and the accessor functions written +@@ -223,6 +224,7 @@ + HASH_INSERT(thr_local_t, hash, thr_local_hash, + os_thread_pf(os_thread_get_curr_id()), + local); ++ thr_local_hash_n_nodes++; + + mutex_exit(&thr_local_mutex); + } +@@ -251,6 +253,7 @@ + + HASH_DELETE(thr_local_t, hash, thr_local_hash, + os_thread_pf(id), local); ++ thr_local_hash_n_nodes--; + + mutex_exit(&thr_local_mutex); + +@@ -274,3 +277,29 @@ + mutex_create(&thr_local_mutex); + mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL); + } ++ ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++thr_local_hash_cells(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++thr_local_hash_nodes(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash_n_nodes ++ * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_show_hashed_memory.patch ++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= diff --git a/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch new file mode 100644 index 0000000..bf8f6b4 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch @@ -0,0 +1,264 @@ +diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c +--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900 +@@ -2454,13 +2454,15 @@ + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } + fprintf(file, +- "Buffer pool size %lu\n" +- "Free buffers %lu\n" +- "Database pages %lu\n" +- "Modified db pages %lu\n" ++ "Buffer pool size %lu\n" ++ "Buffer pool size, bytes %lu\n" ++ "Free buffers %lu\n" ++ "Database pages %lu\n" ++ "Modified db pages %lu\n" + "Pending reads %lu\n" + "Pending writes: LRU %lu, flush list %lu, single page %lu\n", + (ulong) size, ++ (ulong) size * 
UNIV_PAGE_SIZE, + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), +diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c +--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900 +@@ -4472,3 +4472,30 @@ + + return(mach_read_from_2(page + FIL_PAGE_TYPE)); + } ++ ++/************************************************************************* ++Return local hash table informations. */ ++ ++ulint ++fil_system_hash_cells(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (fil_system->spaces->n_cells ++ + fil_system->name_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++fil_system_hash_nodes(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (UT_LIST_GET_LEN(fil_system->space_list) ++ * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h +--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900 +@@ -701,6 +701,16 @@ + written to page, the return value not defined */ + byte* page); /* in: file page */ + ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++fil_system_hash_cells(void); ++/*========================*/ ++ ++ulint ++fil_system_hash_nodes(void); ++/*========================*/ + + typedef struct fil_space_struct fil_space_t; + +diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h +--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900 +@@ -77,6 +77,17 @@ + /*=============================*/ + /* out: pointer to the in_ibuf field */ + ++/************************************************************************* ++Return local hash table informations. */ ++ ++ulint ++thr_local_hash_cells(void); ++/*=======================*/ ++ ++ulint ++thr_local_hash_nodes(void); ++/*=======================*/ ++ + #ifndef UNIV_NONINL + #include "thr0loc.ic" + #endif +diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c +--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900 +@@ -1645,6 +1645,14 @@ + time_t current_time; + ulint n_reserved; + ++ ulint btr_search_sys_subtotal; ++ ulint lock_sys_subtotal; ++ ulint recv_sys_subtotal; ++ ulint io_counter_subtotal; ++ ++ ulint i; ++ trx_t* trx; ++ + mutex_enter(&srv_innodb_monitor_mutex); + + current_time = time(NULL); +@@ -1747,6 +1755,80 @@ + ut_total_allocated_memory, + mem_pool_get_reserved(mem_comm_pool)); + ++ /* Calcurate reserved memories */ ++ if (btr_search_sys && btr_search_sys->hash_index->heap) { ++ btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap); ++ } else { ++ btr_search_sys_subtotal = 0; ++ for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) { ++ btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]); ++ } ++ } ++ ++ lock_sys_subtotal = 
0; ++ if (trx_sys) { ++ mutex_enter(&kernel_mutex); ++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); ++ while (trx) { ++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0); ++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); ++ } ++ mutex_exit(&kernel_mutex); ++ } ++ ++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash) ++ ? mem_heap_get_size(recv_sys->heap) : 0); ++ ++ fprintf(file, ++ "Internal hash tables (constant factor + variable factor)\n" ++ " Adaptive hash index %lu \t(%lu + %lu)\n" ++ " Page hash %lu\n" ++ " Dictionary cache %lu \t(%lu + %lu)\n" ++ " File system %lu \t(%lu + %lu)\n" ++ " Lock system %lu \t(%lu + %lu)\n" ++ " Recovery system %lu \t(%lu + %lu)\n" ++ " Threads %lu \t(%lu + %lu)\n", ++ ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0) ++ + btr_search_sys_subtotal, ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) btr_search_sys_subtotal, ++ ++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)), ++ ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t) ++ + dict_sys->size) : 0), ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0), ++ (ulong) (dict_sys ? (dict_sys->size) : 0), ++ ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t) ++ + fil_system_hash_nodes()), ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) fil_system_hash_nodes(), ++ ++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + lock_sys_subtotal), ++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) lock_sys_subtotal, ++ ++ (ulong) (((recv_sys && recv_sys->addr_hash) ++ ? 
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + recv_sys_subtotal), ++ (ulong) ((recv_sys && recv_sys->addr_hash) ++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) recv_sys_subtotal, ++ ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t) ++ + thr_local_hash_nodes()), ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) thr_local_hash_nodes()); ++ + if (srv_use_awe) { + fprintf(file, + "In addition to that %lu MB of AWE memory allocated\n", +diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c +--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900 +@@ -32,6 +32,7 @@ + + /* The hash table. The module is not yet initialized when it is NULL. */ + hash_table_t* thr_local_hash = NULL; ++ulint thr_local_hash_n_nodes = 0; + + /* The private data for each thread should be put to + the structure below and the accessor functions written +@@ -223,6 +224,7 @@ + HASH_INSERT(thr_local_t, hash, thr_local_hash, + os_thread_pf(os_thread_get_curr_id()), + local); ++ thr_local_hash_n_nodes++; + + mutex_exit(&thr_local_mutex); + } +@@ -251,6 +253,7 @@ + + HASH_DELETE(thr_local_t, hash, thr_local_hash, + os_thread_pf(id), local); ++ thr_local_hash_n_nodes--; + + mutex_exit(&thr_local_mutex); + +@@ -274,3 +277,29 @@ + mutex_create(&thr_local_mutex); + mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL); + } ++ ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++thr_local_hash_cells(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++thr_local_hash_nodes(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash_n_nodes ++ * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_show_hashed_memory.patch ++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= diff --git a/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch b/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch new file mode 100644 index 0000000..a23c1e9 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch @@ -0,0 +1,1914 @@ +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-08-28 11:08:16.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-08-28 11:06:20.000000000 +0900 +@@ -1101,7 +1101,7 @@ + ulint* offsets; + + rw_lock_x_lock(&btr_search_latch); +- mutex_enter(&buf_pool->mutex); ++ mutex_enter(&buf_pool->LRU_mutex); + + table = btr_search_sys->hash_index; + +@@ -1186,7 +1186,7 @@ + block = UT_LIST_GET_PREV(LRU, block); + } + +- mutex_exit(&buf_pool->mutex); ++ mutex_exit(&buf_pool->LRU_mutex); + rw_lock_x_unlock(&btr_search_latch); + + if (UNIV_LIKELY_NULL(heap)) { +diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c 2009-08-28 11:08:16.000000000 +0900 ++++ b/innobase/buf/buf0buf.c 2009-08-28 11:06:30.000000000 +0900 
+@@ -549,6 +549,17 @@ + mutex_create(&(buf_pool->mutex)); + mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL); + ++ mutex_create(&(buf_pool->LRU_mutex)); ++ mutex_set_level(&(buf_pool->LRU_mutex), SYNC_BUF_LRU_LIST); ++ rw_lock_create(&(buf_pool->hash_latch)); ++ rw_lock_set_level(&(buf_pool->hash_latch), SYNC_BUF_PAGE_HASH); ++ mutex_create(&(buf_pool->free_mutex)); ++ mutex_set_level(&(buf_pool->free_mutex), SYNC_BUF_FREE_LIST); ++ mutex_create(&(buf_pool->flush_list_mutex)); ++ mutex_set_level(&(buf_pool->flush_list_mutex), SYNC_BUF_FLUSH_LIST); ++ ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ rw_lock_x_lock(&(buf_pool->hash_latch)); + mutex_enter(&(buf_pool->mutex)); + + if (srv_use_awe) { +@@ -724,6 +735,8 @@ + block->in_free_list = TRUE; + } + ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + mutex_exit(&(buf_pool->mutex)); + + if (srv_use_adaptive_hash_indexes) { +@@ -753,6 +766,7 @@ + { + buf_block_t* bck; + ++ ut_error; /* don't support AWE */ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(buf_pool->mutex))); + #endif /* UNIV_SYNC_DEBUG */ +@@ -851,7 +865,7 @@ + buf_block_t* block) /* in: block to make younger */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(!mutex_own(&(buf_pool->mutex))); ++ ut_ad(!mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + /* Note that we read freed_page_clock's without holding any mutex: +@@ -860,12 +874,12 @@ + if (buf_pool->freed_page_clock >= block->freed_page_clock + + 1 + (buf_pool->curr_size / 4)) { + +- mutex_enter(&buf_pool->mutex); ++ mutex_enter(&buf_pool->LRU_mutex); + /* There has been freeing activity in the LRU list: + best to move to the head of the LRU list */ + + buf_LRU_make_block_young(block); +- mutex_exit(&buf_pool->mutex); ++ mutex_exit(&buf_pool->LRU_mutex); + } + } + +@@ -881,7 +895,7 @@ + { + buf_block_t* block; + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); + + block = buf_block_align(frame); + +@@ -889,7 +903,7 @@ + + 
buf_LRU_make_block_young(block); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + } + + /************************************************************************ +@@ -900,7 +914,7 @@ + /*===========*/ + buf_block_t* block) /* in, own: block to be freed */ + { +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); + + mutex_enter(&block->mutex); + +@@ -910,7 +924,7 @@ + + mutex_exit(&block->mutex); + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); + } + + /************************************************************************* +@@ -951,11 +965,11 @@ + { + buf_block_t* block; + +- mutex_enter_fast(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(block); + } +@@ -972,7 +986,7 @@ + { + buf_block_t* block; + +- mutex_enter_fast(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +@@ -980,7 +994,7 @@ + block->check_index_page_at_flush = FALSE; + } + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + } + + /************************************************************************ +@@ -999,7 +1013,7 @@ + buf_block_t* block; + ibool is_hashed; + +- mutex_enter_fast(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +@@ -1009,7 +1023,7 @@ + is_hashed = block->is_hashed; + } + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(is_hashed); + } +@@ -1051,7 +1065,7 @@ + { + buf_block_t* block; + +- mutex_enter_fast(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +@@ -1059,7 +1073,7 @@ + block->file_page_was_freed = TRUE; + } + +- mutex_exit(&(buf_pool->mutex)); ++ 
rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(block); + } +@@ -1080,7 +1094,7 @@ + { + buf_block_t* block; + +- mutex_enter_fast(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +@@ -1088,7 +1102,7 @@ + block->file_page_was_freed = FALSE; + } + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(block); + } +@@ -1167,26 +1181,33 @@ + buf_pool->n_page_gets++; + loop: + block = NULL; +- mutex_enter_fast(&(buf_pool->mutex)); ++ //mutex_enter_fast(&(buf_pool->mutex)); + + if (guess) { + block = buf_block_align(guess); + ++ mutex_enter(&block->mutex); + if ((offset != block->offset) || (space != block->space) + || (block->state != BUF_BLOCK_FILE_PAGE)) { + ++ mutex_exit(&block->mutex); + block = NULL; + } + } + + if (block == NULL) { ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + block = buf_page_hash_get(space, offset); ++ if(block) { ++ mutex_enter(&block->mutex); ++ } ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + } + + if (block == NULL) { + /* Page not in buf_pool: needs to be read from file */ + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); + + if (mode == BUF_GET_IF_IN_POOL) { + +@@ -1205,7 +1226,7 @@ + goto loop; + } + +- mutex_enter(&block->mutex); ++ //mutex_enter(&block->mutex); + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + +@@ -1217,7 +1238,7 @@ + + if (mode == BUF_GET_IF_IN_POOL) { + /* The page is only being read to buffer */ +- mutex_exit(&buf_pool->mutex); ++ //mutex_exit(&buf_pool->mutex); + mutex_exit(&block->mutex); + + return(NULL); +@@ -1242,7 +1263,7 @@ + #else + buf_block_buf_fix_inc(block); + #endif +- mutex_exit(&buf_pool->mutex); ++ //mutex_exit(&buf_pool->mutex); + + /* Check if this is the first access to the page */ + +@@ -1685,7 +1706,7 @@ + buf_block_t* block) /* in: block to init */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + 
ut_ad(mutex_own(&(block->mutex))); + #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state != BUF_BLOCK_FILE_PAGE); +@@ -1792,7 +1813,8 @@ + + ut_a(block); + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ rw_lock_x_lock(&(buf_pool->hash_latch)); + mutex_enter(&block->mutex); + + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, +@@ -1807,7 +1829,8 @@ + being deleted, or the page is already in buf_pool, return */ + + mutex_exit(&block->mutex); +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + buf_block_free(block); + +@@ -1822,10 +1845,14 @@ + ut_ad(block); + + buf_page_init(space, offset, block); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + /* The block must be put to the LRU list, to the old blocks */ + + buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */ ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ ++ mutex_enter(&(buf_pool->mutex)); /* for consistency about aio */ + + block->io_fix = BUF_IO_READ; + +@@ -1874,7 +1901,8 @@ + + free_block = buf_LRU_get_free_block(); + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ rw_lock_x_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + +@@ -1885,7 +1913,8 @@ + block->file_page_was_freed = FALSE; + + /* Page can be found in buf_pool */ +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + buf_block_free(free_block); + +@@ -1908,6 +1937,7 @@ + mutex_enter(&block->mutex); + + buf_page_init(space, offset, block); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + /* The block must be put to the LRU list */ + buf_LRU_add_block(block, FALSE); +@@ -1919,7 +1949,7 @@ + #endif + buf_pool->n_pages_created++; + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + + mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + +@@ -1933,7 +1963,7 @@ + ibuf_merge_or_delete_for_page(NULL, 
space, offset, TRUE); + + /* Flush pages from the end of the LRU list if necessary */ +- buf_flush_free_margin(); ++ buf_flush_free_margin(FALSE); + + frame = block->frame; + +@@ -1969,6 +1999,7 @@ + { + ulint io_type; + ulint read_page_no; ++ ulint flush_type; + + buf_io_counter_t* io_counter; + ulint fold; +@@ -2051,9 +2082,6 @@ + } + } + +- mutex_enter(&(buf_pool->mutex)); +- mutex_enter(&block->mutex); +- + #ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); + #endif +@@ -2062,9 +2090,12 @@ + removes the newest lock debug record, without checking the thread + id. */ + +- block->io_fix = 0; +- + if (io_type == BUF_IO_READ) { ++ mutex_enter(&block->mutex); ++ mutex_enter(&(buf_pool->mutex)); ++ ++ block->io_fix = 0; ++ + /* NOTE that the call to ibuf may have moved the ownership of + the x-latch to this OS thread: do not let this confuse you in + debugging! */ +@@ -2095,6 +2126,8 @@ + } + } + ++ mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&block->mutex); + #ifdef UNIV_DEBUG + if (buf_debug_prints) { + fputs("Has read ", stderr); +@@ -2103,11 +2136,24 @@ + } else { + ut_ad(io_type == BUF_IO_WRITE); + ++ flush_type = block->flush_type; ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ } ++ mutex_enter(&block->mutex); ++ mutex_enter(&(buf_pool->mutex)); ++ ++ block->io_fix = 0; ++ + /* Write means a flush operation: call the completion + routine in the flush system */ + + buf_flush_write_complete(block); + ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ } ++ + rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE); + /* io_counter here */ + if (srv_io_pattern && srv_io_pattern_trace_running) { +@@ -2132,6 +2178,9 @@ + + buf_pool->n_pages_written++; + ++ mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&block->mutex); ++ + #ifdef UNIV_DEBUG + if (buf_debug_prints) { + fputs("Has written ", stderr); +@@ -2139,9 +2188,6 @@ + #endif /* UNIV_DEBUG */ + } + +- mutex_exit(&block->mutex); 
+- mutex_exit(&(buf_pool->mutex)); +- + #ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, "page space %lu page no %lu\n", +@@ -2169,11 +2215,11 @@ + freed = buf_LRU_search_and_free_block(100); + } + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); + + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + } + + /************************************************************************* +@@ -2195,7 +2241,10 @@ + + ut_ad(buf_pool); + +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ rw_lock_x_lock(&(buf_pool->hash_latch)); ++ /* for keep the new latch order, it cannot validate correctly... */ + + for (i = 0; i < buf_pool->curr_size; i++) { + +@@ -2256,18 +2305,26 @@ + } + + ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); ++ /* because of latching order with block->mutex, we cannot get free_mutex before that */ ++/* + if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { + fprintf(stderr, "Free list len %lu, free blocks %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); + ut_error; + } ++*/ ++ /* because of latching order with block->mutex, we cannot get flush_list_mutex before that */ ++/* + ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); + + ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); + ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); ++*/ + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + ut_a(buf_LRU_validate()); + ut_a(buf_flush_validate()); +@@ -2299,7 +2356,9 @@ + index_ids = mem_alloc(sizeof(dulint) * size); + counts = mem_alloc(sizeof(ulint) * size); + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ mutex_enter(&(buf_pool->free_mutex)); ++ 
mutex_enter(&(buf_pool->flush_list_mutex)); + + fprintf(stderr, + "buf_pool size %lu\n" +@@ -2352,7 +2411,9 @@ + } + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ mutex_exit(&(buf_pool->free_mutex)); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + + for (i = 0; i < n_found; i++) { + index = dict_index_get_if_in_cache(index_ids[i]); +@@ -2387,7 +2448,7 @@ + ulint i; + ulint fixed_pages_number = 0; + +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); + + for (i = 0; i < buf_pool->curr_size; i++) { + +@@ -2404,7 +2465,7 @@ + } + } + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); + return fixed_pages_number; + } + #endif /* UNIV_DEBUG */ +@@ -2432,7 +2493,7 @@ + { + ulint ratio; + +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */ + + ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list)) + / (1 + UT_LIST_GET_LEN(buf_pool->LRU) +@@ -2440,7 +2501,7 @@ + + /* 1 + is there to avoid division by zero */ + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); /* optimistic */ + + return(ratio); + } +@@ -2460,7 +2521,10 @@ + ut_ad(buf_pool); + size = buf_pool->curr_size; + ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ mutex_enter(&(buf_pool->free_mutex)); + mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + if (srv_use_awe) { + fprintf(stderr, +@@ -2533,7 +2597,10 @@ + buf_pool->n_pages_written_old = buf_pool->n_pages_written; + buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped; + ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ mutex_exit(&(buf_pool->free_mutex)); + mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + } + + /************************************************************************** +@@ -2563,7 +2630,7 @@ + + ut_ad(buf_pool); + +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */ + + for (i = 0; i < 
buf_pool->curr_size; i++) { + +@@ -2586,7 +2653,7 @@ + mutex_exit(&block->mutex); + } + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); /* optimistic */ + + return(TRUE); + } +@@ -2626,11 +2693,11 @@ + { + ulint len; + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->free_mutex)); + + len = UT_LIST_GET_LEN(buf_pool->free); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->free_mutex)); + + return(len); + } +diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c +--- a/innobase/buf/buf0flu.c 2009-08-28 11:08:17.000000000 +0900 ++++ b/innobase/buf/buf0flu.c 2009-08-28 11:06:30.000000000 +0900 +@@ -49,7 +49,9 @@ + buf_block_t* block) /* in: block which is modified */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&block->mutex)); ++ ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); +@@ -79,7 +81,9 @@ + buf_block_t* b; + + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&block->mutex)); ++ ut_ad(mutex_own(&(buf_pool->flush_list_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + prev_b = NULL; +@@ -130,16 +134,18 @@ + BUF_BLOCK_FILE_PAGE and in the LRU list */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + #endif /* UNIV_SYNC_DEBUG */ +- if (block->state != BUF_BLOCK_FILE_PAGE) { ++ if (!block->in_LRU_list || block->state != BUF_BLOCK_FILE_PAGE) { ++ /* permited not to own LRU_mutex.. 
*/ ++/* + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: buffer block state %lu in the LRU list!\n", + (ulong)block->state); + ut_print_buf(stderr, (byte*)block, sizeof(buf_block_t)); +- ++*/ + return(FALSE); + } + +@@ -165,12 +171,13 @@ + ulint flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&(block->mutex))); + #endif /* UNIV_SYNC_DEBUG */ +- ut_a(block->state == BUF_BLOCK_FILE_PAGE); ++ //ut_a(block->state == BUF_BLOCK_FILE_PAGE); + +- if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) ++ if (block->state == BUF_BLOCK_FILE_PAGE ++ && (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) + && (block->io_fix == 0)) { + if (flush_type != BUF_FLUSH_LRU) { + +@@ -199,15 +206,17 @@ + { + ut_ad(block); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ++ mutex_enter(&(buf_pool->flush_list_mutex)); + block->oldest_modification = ut_dulint_zero; + + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); + + ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list)); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + + (buf_pool->n_flush[block->flush_type])--; + +@@ -553,18 +562,20 @@ + ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST + || flush_type == BUF_FLUSH_SINGLE_PAGE); + +- mutex_enter(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + block = buf_page_hash_get(space, offset); + + ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE); + + if (!block) { +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + return(0); + } + + mutex_enter(&block->mutex); ++ mutex_enter(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + if (flush_type == BUF_FLUSH_LIST + && 
buf_flush_ready_for_flush(block, flush_type)) { +@@ -761,7 +772,7 @@ + high = fil_space_get_size(space); + } + +- mutex_enter(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + for (i = low; i < high; i++) { + +@@ -795,7 +806,7 @@ + + mutex_exit(&block->mutex); + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + /* Note: as we release the buf_pool mutex + above, in buf_flush_try_page we cannot be sure +@@ -806,14 +817,14 @@ + count += buf_flush_try_page(space, i, + flush_type); + +- mutex_enter(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + } else { + mutex_exit(&block->mutex); + } + } + } + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(count); + } +@@ -848,6 +859,7 @@ + ulint space; + ulint offset; + ibool found; ++ ulint remaining = 0; + + ut_ad((flush_type == BUF_FLUSH_LRU) + || (flush_type == BUF_FLUSH_LIST)); +@@ -866,6 +878,12 @@ + } + + (buf_pool->init_flush)[flush_type] = TRUE; ++ ++ mutex_exit(&(buf_pool->mutex)); ++ ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ } + + for (;;) { + /* If we have flushed enough, leave the loop */ +@@ -882,7 +900,10 @@ + } else { + ut_ad(flush_type == BUF_FLUSH_LIST); + ++ mutex_enter(&(buf_pool->flush_list_mutex)); ++ remaining = UT_LIST_GET_LEN(buf_pool->flush_list); + block = UT_LIST_GET_LAST(buf_pool->flush_list); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + if (!block + || (ut_dulint_cmp(block->oldest_modification, + lsn_limit) >= 0)) { +@@ -912,7 +933,9 @@ + offset = block->offset; + + mutex_exit(&block->mutex); +- mutex_exit(&(buf_pool->mutex)); ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ } + + old_page_count = page_count; + +@@ -932,7 +955,9 @@ + flush_type, offset, + page_count - old_page_count); */ + +- mutex_enter(&(buf_pool->mutex)); ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ } + + } 
else if (flush_type == BUF_FLUSH_LRU) { + +@@ -944,17 +969,26 @@ + + mutex_exit(&block->mutex); + ++ mutex_enter(&(buf_pool->flush_list_mutex)); + block = UT_LIST_GET_PREV(flush_list, block); ++ mutex_exit(&(buf_pool->flush_list_mutex)); ++ remaining--; + } + } + + /* If we could not find anything to flush, leave the loop */ + +- if (!found) { ++ if (!found && !remaining) { + break; + } + } + ++ if (flush_type == BUF_FLUSH_LRU) { ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ } ++ ++ mutex_enter(&(buf_pool->mutex)); ++ + (buf_pool->init_flush)[flush_type] = FALSE; + + if ((buf_pool->n_flush[flush_type] == 0) +@@ -1013,11 +1047,15 @@ + buf_block_t* block; + ulint n_replaceable; + ulint distance = 0; ++ ibool optimistic = TRUE; + +- mutex_enter(&(buf_pool->mutex)); +- ++ //mutex_enter(&(buf_pool->mutex)); ++retry: + n_replaceable = UT_LIST_GET_LEN(buf_pool->free); + ++ if (!optimistic) ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while ((block != NULL) +@@ -1025,6 +1063,12 @@ + + BUF_FLUSH_EXTRA_MARGIN) + && (distance < BUF_LRU_FREE_SEARCH_LEN)) { + ++ if (!block->in_LRU_list) { ++ /* reatart. but it is very optimistic */ ++ block = UT_LIST_GET_LAST(buf_pool->LRU); ++ continue; ++ } ++ + mutex_enter(&block->mutex); + + if (buf_flush_ready_for_replace(block)) { +@@ -1038,11 +1082,17 @@ + block = UT_LIST_GET_PREV(LRU, block); + } + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); ++ if (!optimistic) ++ mutex_exit(&(buf_pool->LRU_mutex)); + + if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { + + return(0); ++ } else if (optimistic) { ++ /* confirm it again with LRU_mutex for exactness */ ++ optimistic = FALSE; ++ goto retry; + } + + return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN +@@ -1057,8 +1107,9 @@ + immediately, without waiting. 
*/ + + void +-buf_flush_free_margin(void) ++buf_flush_free_margin( + /*=======================*/ ++ ibool wait) + { + ulint n_to_flush; + ulint n_flushed; +@@ -1068,7 +1119,7 @@ + if (n_to_flush > 0) { + n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, + ut_dulint_zero); +- if (n_flushed == ULINT_UNDEFINED) { ++ if (wait && n_flushed == ULINT_UNDEFINED) { + /* There was an LRU type flush batch already running; + let us wait for it to end */ + +@@ -1118,11 +1169,11 @@ + { + ibool ret; + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + ret = buf_flush_validate_low(); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + + return(ret); + } +diff -ruN a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c +--- a/innobase/buf/buf0lru.c 2009-07-07 21:53:57.000000000 +0900 ++++ b/innobase/buf/buf0lru.c 2009-08-28 11:06:30.000000000 +0900 +@@ -108,7 +108,7 @@ + + page_arr = ut_malloc(sizeof(ulint) + * BUF_LRU_DROP_SEARCH_HASH_SIZE); +- mutex_enter(&buf_pool->mutex); ++ mutex_enter(&buf_pool->LRU_mutex); + + scan_again: + num_entries = 0; +@@ -147,12 +147,12 @@ + } + /* Array full. We release the buf_pool->mutex to + obey the latching order. */ +- mutex_exit(&buf_pool->mutex); ++ mutex_exit(&buf_pool->LRU_mutex); + + buf_LRU_drop_page_hash_batch(id, page_arr, + num_entries); + num_entries = 0; +- mutex_enter(&buf_pool->mutex); ++ mutex_enter(&buf_pool->LRU_mutex); + } else { + mutex_exit(&block->mutex); + } +@@ -177,7 +177,7 @@ + } + } + +- mutex_exit(&buf_pool->mutex); ++ mutex_exit(&buf_pool->LRU_mutex); + + /* Drop any remaining batch of search hashed pages. 
*/ + buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); +@@ -206,7 +206,8 @@ + buf_LRU_drop_page_hash_for_tablespace(id); + + scan_again: +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ rw_lock_x_lock(&(buf_pool->hash_latch)); + + all_freed = TRUE; + +@@ -244,7 +245,8 @@ + + mutex_exit(&block->mutex); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + /* Note that the following call will acquire + an S-latch on the page */ +@@ -274,7 +276,8 @@ + block = UT_LIST_GET_PREV(LRU, block); + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + + if (!all_freed) { + os_thread_sleep(20000); +@@ -297,14 +300,14 @@ + ulint len; + ulint limit; + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); + + len = UT_LIST_GET_LEN(buf_pool->LRU); + + if (len < BUF_LRU_OLD_MIN_LEN) { + /* The LRU list is too short to do read-ahead */ + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + + return(0); + } +@@ -313,7 +316,7 @@ + + limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO; + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + + return(limit); + } +@@ -337,13 +340,15 @@ + ulint distance = 0; + ibool freed; + +- mutex_enter(&(buf_pool->mutex)); ++ /* optimistic search... 
*/ ++ //mutex_enter(&(buf_pool->mutex)); + ++retry: + freed = FALSE; + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while (block != NULL) { +- ut_a(block->in_LRU_list); ++ //ut_a(block->in_LRU_list); /* optimistic */ + + mutex_enter(&block->mutex); + +@@ -358,9 +363,17 @@ + } + #endif /* UNIV_DEBUG */ + ++ mutex_exit(&block->mutex); ++ ++ mutex_enter(&(buf_pool->LRU_mutex));/* optimistic */ ++ ++ rw_lock_x_lock(&(buf_pool->hash_latch)); ++ mutex_enter(&block->mutex); ++ if(block->in_LRU_list && buf_flush_ready_for_replace(block)) { + buf_LRU_block_remove_hashed_page(block); ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + mutex_exit(&block->mutex); + + /* Remove possible adaptive hash index built on the +@@ -373,7 +386,6 @@ + + ut_a(block->buf_fix_count == 0); + +- mutex_enter(&(buf_pool->mutex)); + mutex_enter(&block->mutex); + + buf_LRU_block_free_hashed_page(block); +@@ -381,6 +393,16 @@ + mutex_exit(&block->mutex); + + break; ++ } else { /* someone may interrupt...??? 
*/ ++ mutex_exit(&(buf_pool->LRU_mutex));/* optimistic */ ++ ++ rw_lock_x_unlock(&(buf_pool->hash_latch)); ++ ++ if (!(block->in_LRU_list)) { ++ mutex_exit(&block->mutex); ++ goto retry; ++ } ++ } + } + + mutex_exit(&block->mutex); +@@ -391,6 +413,7 @@ + if (!freed && n_iterations <= 10 + && distance > 100 + (n_iterations * buf_pool->curr_size) + / 10) { ++ mutex_enter(&(buf_pool->mutex)); + buf_pool->LRU_flush_ended = 0; + + mutex_exit(&(buf_pool->mutex)); +@@ -398,6 +421,8 @@ + return(FALSE); + } + } ++ ++ mutex_enter(&(buf_pool->mutex)); + if (buf_pool->LRU_flush_ended > 0) { + buf_pool->LRU_flush_ended--; + } +@@ -449,7 +474,8 @@ + { + ibool ret = FALSE; + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); ++ mutex_enter(&(buf_pool->free_mutex)); + + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) { +@@ -457,7 +483,8 @@ + ret = TRUE; + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ mutex_exit(&(buf_pool->free_mutex)); + + return(ret); + } +@@ -480,7 +507,7 @@ + ibool mon_value_was = FALSE; + ibool started_monitor = FALSE; + loop: +- mutex_enter(&(buf_pool->mutex)); ++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */ + + if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { +@@ -536,10 +563,16 @@ + /* If there is a block in the free list, take it */ + if (UT_LIST_GET_LEN(buf_pool->free) > 0) { + +- block = UT_LIST_GET_FIRST(buf_pool->free); ++ mutex_enter(&(buf_pool->free_mutex)); ++ block = UT_LIST_GET_LAST(buf_pool->free); ++ if (!block) { ++ mutex_exit(&(buf_pool->free_mutex)); ++ goto no_block; ++ } + ut_a(block->in_free_list); + UT_LIST_REMOVE(free, buf_pool->free, block); + block->in_free_list = FALSE; ++ mutex_exit(&(buf_pool->free_mutex)); + ut_a(block->state != BUF_BLOCK_FILE_PAGE); + ut_a(!block->in_LRU_list); + +@@ -564,7 +597,7 @@ + + 
mutex_exit(&block->mutex); + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); + + if (started_monitor) { + srv_print_innodb_monitor = mon_value_was; +@@ -572,11 +605,12 @@ + + return(block); + } ++no_block: + + /* If no block was in the free list, search from the end of the LRU + list and try to free a block there */ + +- mutex_exit(&(buf_pool->mutex)); ++ //mutex_exit(&(buf_pool->mutex)); + + freed = buf_LRU_search_and_free_block(n_iterations); + +@@ -613,7 +647,7 @@ + + /* No free block was found: try to flush the LRU list */ + +- buf_flush_free_margin(); ++ buf_flush_free_margin(TRUE); + ++srv_buf_pool_wait_free; + + os_aio_simulated_wake_handler_threads(); +@@ -655,7 +689,7 @@ + + ut_a(buf_pool->LRU_old); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + ut_ad(3 * (BUF_LRU_OLD_MIN_LEN / 8) > BUF_LRU_OLD_TOLERANCE + 5); + +@@ -730,7 +764,7 @@ + ut_ad(buf_pool); + ut_ad(block); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); +@@ -796,7 +830,7 @@ + ut_ad(buf_pool); + ut_ad(block); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); +@@ -861,7 +895,7 @@ + ut_ad(buf_pool); + ut_ad(block); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); +@@ -964,7 +998,7 @@ + buf_block_t* block) /* in: block, must not contain a file page */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + #endif /* UNIV_SYNC_DEBUG */ + ut_ad(block); +@@ -981,8 +1015,10 @@ + 
/* Wipe contents of page to reveal possible stale pointers to it */ + memset(block->frame, '\0', UNIV_PAGE_SIZE); + #endif ++ mutex_enter(&(buf_pool->free_mutex)); + UT_LIST_ADD_FIRST(free, buf_pool->free, block); + block->in_free_list = TRUE; ++ mutex_exit(&(buf_pool->free_mutex)); + + if (srv_use_awe && block->frame) { + /* Add to the list of mapped pages */ +@@ -1004,7 +1040,7 @@ + may or may not be a hash index to the page */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + ut_ad(mutex_own(&block->mutex)); + #endif /* UNIV_SYNC_DEBUG */ + ut_ad(block); +@@ -1062,7 +1098,7 @@ + be in a state where it can be freed */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + ut_ad(mutex_own(&block->mutex)); + #endif /* UNIV_SYNC_DEBUG */ + ut_a(block->state == BUF_BLOCK_REMOVE_HASH); +@@ -1085,7 +1121,7 @@ + ulint LRU_pos; + + ut_ad(buf_pool); +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); + + if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { + +@@ -1130,6 +1166,9 @@ + ut_a(buf_pool->LRU_old_len == old_len); + } + ++ mutex_exit(&(buf_pool->LRU_mutex)); ++ mutex_enter(&(buf_pool->free_mutex)); ++ + UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free); + + block = UT_LIST_GET_FIRST(buf_pool->free); +@@ -1140,7 +1179,7 @@ + block = UT_LIST_GET_NEXT(free, block); + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->free_mutex)); + return(TRUE); + } + +@@ -1156,7 +1195,7 @@ + ulint len; + + ut_ad(buf_pool); +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&(buf_pool->LRU_mutex)); + + fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); + +@@ -1200,5 +1239,5 @@ + } + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->LRU_mutex)); + } +diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c +--- a/innobase/buf/buf0rea.c 2009-08-28 11:08:17.000000000 +0900 
++++ b/innobase/buf/buf0rea.c 2009-08-28 11:06:30.000000000 +0900 +@@ -277,10 +277,12 @@ + + return(0); + } ++ mutex_exit(&(buf_pool->mutex)); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + for (i = low; i < high; i++) { + block = buf_page_hash_get(space, i); + +@@ -292,7 +294,7 @@ + } + } + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) { + /* Do nothing */ +@@ -388,7 +390,7 @@ + } + + /* Flush pages from the end of the LRU list if necessary */ +- buf_flush_free_margin(); ++ buf_flush_free_margin(FALSE); + + return(count + count2); + } +@@ -491,6 +493,7 @@ + + return(0); + } ++ mutex_exit(&(buf_pool->mutex)); + + /* Check that almost all pages in the area have been accessed; if + offset == low, the accesses must be in a descending order, otherwise, +@@ -504,6 +507,7 @@ + + fail_count = 0; + ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + for (i = low; i < high; i++) { + block = buf_page_hash_get(space, i); + +@@ -520,23 +524,23 @@ + pred_block = block; + } + } ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - + BUF_READ_AHEAD_LINEAR_THRESHOLD) { + /* Too many failures: return */ + +- mutex_exit(&(buf_pool->mutex)); +- + return(0); + } + + /* If we got this far, we know that enough pages in the area have + been accessed in the right order: linear read-ahead can be sensible */ + ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + block = buf_page_hash_get(space, offset); + + if (block == NULL) { +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + return(0); + } +@@ -552,7 +556,7 @@ + pred_offset = fil_page_get_prev(frame); + succ_offset = fil_page_get_next(frame); + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + if ((offset == low) && (succ_offset 
== offset + 1)) { + +@@ -628,7 +632,7 @@ + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ +- buf_flush_free_margin(); ++ buf_flush_free_margin(FALSE); + + #ifdef UNIV_DEBUG + if (buf_debug_prints && (count > 0)) { +@@ -696,7 +700,7 @@ + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ +- buf_flush_free_margin(); ++ buf_flush_free_margin(FALSE); + + #ifdef UNIV_DEBUG + if (buf_debug_prints) { +@@ -768,7 +772,7 @@ + os_aio_simulated_wake_handler_threads(); + + /* Flush pages from the end of the LRU list if necessary */ +- buf_flush_free_margin(); ++ buf_flush_free_margin(FALSE); + + #ifdef UNIV_DEBUG + if (buf_debug_prints) { +diff -ruN a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h +--- a/innobase/include/buf0buf.h 2009-08-28 11:08:16.000000000 +0900 ++++ b/innobase/include/buf0buf.h 2009-08-28 11:06:30.000000000 +0900 +@@ -946,6 +946,7 @@ + mem_heap_t* io_counter_heap; + ulint io_counters; + hash_table_t* page_hash; /* hash table of the file pages */ ++ rw_lock_t hash_latch; + + ulint n_pend_reads; /* number of pending read operations */ + +@@ -978,6 +979,7 @@ + UT_LIST_BASE_NODE_T(buf_block_t) flush_list; + /* base node of the modified block + list */ ++ mutex_t flush_list_mutex; + ibool init_flush[BUF_FLUSH_LIST + 1]; + /* this is TRUE when a flush of the + given type is being initialized */ +@@ -1011,8 +1013,10 @@ + in the case of AWE, at the start are + always free blocks for which the + physical memory is mapped to a frame */ ++ mutex_t free_mutex; + UT_LIST_BASE_NODE_T(buf_block_t) LRU; + /* base node of the LRU list */ ++ mutex_t LRU_mutex; + buf_block_t* LRU_old; /* pointer to the about 3/8 oldest + blocks in the LRU list; NULL if LRU + length less than BUF_LRU_OLD_MIN_LEN */ +diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic +--- a/innobase/include/buf0buf.ic 2009-07-07 21:54:00.000000000 +0900 ++++ 
b/innobase/include/buf0buf.ic 2009-08-28 11:06:30.000000000 +0900 +@@ -112,7 +112,8 @@ + buf_block_t* block; + dulint lsn; + +- mutex_enter(&(buf_pool->mutex)); ++try_again: ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + block = UT_LIST_GET_LAST(buf_pool->flush_list); + +@@ -120,9 +121,13 @@ + lsn = ut_dulint_zero; + } else { + lsn = block->oldest_modification; ++ if (ut_dulint_cmp(lsn, ut_dulint_zero) == 0) { ++ mutex_exit(&(buf_pool->flush_list_mutex)); ++ goto try_again; ++ } + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + + return(lsn); + } +@@ -137,7 +142,7 @@ + /* out: new clock value */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(mutex_own(&(buf_pool->LRU_mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + buf_pool->ulint_clock++; +@@ -392,18 +397,18 @@ + /* out: TRUE if io going on */ + buf_block_t* block) /* in: buf_pool block, must be bufferfixed */ + { +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&block->mutex); + + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_ad(block->buf_fix_count > 0); + + if (block->io_fix != 0) { +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&block->mutex); + + return(TRUE); + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&block->mutex); + + return(FALSE); + } +@@ -425,7 +430,7 @@ + + block = buf_block_align(frame); + +- mutex_enter(&(buf_pool->mutex)); ++ mutex_enter(&block->mutex); + + if (block->state == BUF_BLOCK_FILE_PAGE) { + lsn = block->newest_modification; +@@ -433,7 +438,7 @@ + lsn = ut_dulint_zero; + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&block->mutex); + + return(lsn); + } +@@ -456,7 +461,7 @@ + block = buf_block_align(frame); + + #ifdef UNIV_SYNC_DEBUG +- ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) ++ ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); + #endif /*UNIV_SYNC_DEBUG */ + +@@ -477,7 +482,7 @@ + 
buf_block_t* block) /* in: block */ + { + #ifdef UNIV_SYNC_DEBUG +- ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0)) ++ ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0)) + || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); + #endif /* UNIV_SYNC_DEBUG */ + +@@ -555,7 +560,8 @@ + + ut_ad(buf_pool); + #ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ ut_ad(rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_EX) ++ || rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_SHARED)); + #endif /* UNIV_SYNC_DEBUG */ + + /* Look for the page in the hash table */ +@@ -631,11 +637,14 @@ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + ++ /* buf_flush_note_modification() should be called before this function. */ ++/* + if (rw_latch == RW_X_LATCH && mtr->modifications) { + mutex_enter(&buf_pool->mutex); + buf_flush_note_modification(block, mtr); + mutex_exit(&buf_pool->mutex); + } ++*/ + + mutex_enter(&block->mutex); + +diff -ruN a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h +--- a/innobase/include/buf0flu.h 2009-07-07 21:54:00.000000000 +0900 ++++ b/innobase/include/buf0flu.h 2009-08-28 11:06:30.000000000 +0900 +@@ -26,8 +26,9 @@ + a margin of replaceable pages there. */ + + void +-buf_flush_free_margin(void); ++buf_flush_free_margin( + /*=======================*/ ++ ibool wait); + /************************************************************************ + Initializes a page for writing to the tablespace. 
*/ + +diff -ruN a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic +--- a/innobase/include/buf0flu.ic 2009-07-07 21:54:00.000000000 +0900 ++++ b/innobase/include/buf0flu.ic 2009-08-28 11:06:30.000000000 +0900 +@@ -38,11 +38,14 @@ + mtr_t* mtr) /* in: mtr */ + { + ut_ad(block); ++ ++ mutex_enter(&block->mutex); ++ + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_ad(block->buf_fix_count > 0); + #ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); +- ut_ad(mutex_own(&(buf_pool->mutex))); ++ //ut_ad(mutex_own(&(buf_pool->mutex))); + #endif /* UNIV_SYNC_DEBUG */ + + ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0); +@@ -52,16 +55,20 @@ + block->newest_modification = mtr->end_lsn; + + if (ut_dulint_is_zero(block->oldest_modification)) { ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + block->oldest_modification = mtr->start_lsn; + ut_ad(!ut_dulint_is_zero(block->oldest_modification)); + + buf_flush_insert_into_flush_list(block); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + } else { + ut_ad(ut_dulint_cmp(block->oldest_modification, + mtr->start_lsn) <= 0); + } + ++ mutex_exit(&block->mutex); ++ + ++srv_buf_pool_write_requests; + } + +@@ -78,29 +85,32 @@ + set of mtr's */ + { + ut_ad(block); ++ ++ mutex_enter(&(block->mutex)); ++ + ut_ad(block->state == BUF_BLOCK_FILE_PAGE); + ut_ad(block->buf_fix_count > 0); + #ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); + #endif /* UNIV_SYNC_DEBUG */ + +- mutex_enter(&(buf_pool->mutex)); +- + ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0); + + block->newest_modification = end_lsn; + + if (ut_dulint_is_zero(block->oldest_modification)) { ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + block->oldest_modification = start_lsn; + + ut_ad(!ut_dulint_is_zero(block->oldest_modification)); + + buf_flush_insert_sorted_into_flush_list(block); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + } else { + ut_ad(ut_dulint_cmp(block->oldest_modification, + 
start_lsn) <= 0); + } + +- mutex_exit(&(buf_pool->mutex)); ++ mutex_exit(&(block->mutex)); + } +diff -ruN a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h +--- a/innobase/include/sync0sync.h 2009-07-07 21:54:06.000000000 +0900 ++++ b/innobase/include/sync0sync.h 2009-08-28 11:06:30.000000000 +0900 +@@ -438,8 +438,12 @@ + SYNC_SEARCH_SYS, as memory allocation + can call routines there! Otherwise + the level is SYNC_MEM_HASH. */ ++#define SYNC_BUF_LRU_LIST 157 ++#define SYNC_BUF_PAGE_HASH 156 ++#define SYNC_BUF_BLOCK 155 ++#define SYNC_BUF_FREE_LIST 153 + #define SYNC_BUF_POOL 150 +-#define SYNC_BUF_BLOCK 149 ++#define SYNC_BUF_FLUSH_LIST 149 + #define SYNC_DOUBLEWRITE 140 + #define SYNC_ANY_LATCH 135 + #define SYNC_THR_LOCAL 133 +diff -ruN a/innobase/log/log0recv.c b/innobase/log/log0recv.c +--- a/innobase/log/log0recv.c 2009-08-28 11:08:17.000000000 +0900 ++++ b/innobase/log/log0recv.c 2009-08-28 11:06:30.000000000 +0900 +@@ -1695,11 +1695,11 @@ + + mtr_start(&mtr); + +- mutex_enter(&(buf_pool->mutex)); ++ rw_lock_s_lock(&(buf_pool->hash_latch)); + + page = buf_page_hash_get(space, page_no)->frame; + +- mutex_exit(&(buf_pool->mutex)); ++ rw_lock_s_unlock(&(buf_pool->hash_latch)); + + replica = buf_page_get(space + RECV_REPLICA_SPACE_ADD, page_no, + RW_X_LATCH, &mtr); +diff -ruN a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c +--- a/innobase/mtr/mtr0mtr.c 2009-07-07 21:54:08.000000000 +0900 ++++ b/innobase/mtr/mtr0mtr.c 2009-08-28 11:06:30.000000000 +0900 +@@ -103,6 +103,38 @@ + } + } + ++UNIV_INLINE ++void ++mtr_memo_note_modification_all( ++/*===========================*/ ++ mtr_t* mtr) /* in: mtr */ ++{ ++ mtr_memo_slot_t* slot; ++ dyn_array_t* memo; ++ ulint offset; ++ ++ ut_ad(mtr); ++ ut_ad(mtr->magic_n == MTR_MAGIC_N); ++ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in ++ commit */ ++ ut_ad(mtr->modifications); ++ ++ memo = &(mtr->memo); ++ ++ offset = dyn_array_get_data_size(memo); ++ ++ while (offset > 0) { ++ offset -= 
sizeof(mtr_memo_slot_t); ++ slot = dyn_array_get_element(memo, offset); ++ ++ if (UNIV_LIKELY(slot->object != NULL) && ++ slot->type == MTR_MEMO_PAGE_X_FIX) { ++ buf_flush_note_modification( ++ (buf_block_t*)slot->object, mtr); ++ } ++ } ++} ++ + /**************************************************************** + Writes the contents of a mini-transaction log, if any, to the database log. */ + static +@@ -177,6 +209,8 @@ + #endif + if (mtr->modifications) { + mtr_log_reserve_and_write(mtr); ++ ++ mtr_memo_note_modification_all(mtr); + } + + /* We first update the modification info to buffer pages, and only +@@ -187,12 +221,13 @@ + required when we insert modified buffer pages in to the flush list + which must be sorted on oldest_modification. */ + +- mtr_memo_pop_all(mtr); +- + if (mtr->modifications) { + log_release(); + } + ++ /* All unlocking has been moved here, after log_sys mutex release. */ ++ mtr_memo_pop_all(mtr); ++ + #ifdef UNIV_DEBUG + mtr->state = MTR_COMMITTED; + #endif +@@ -262,6 +297,12 @@ + slot = dyn_array_get_element(memo, offset); + + if ((object == slot->object) && (type == slot->type)) { ++ if (mtr->modifications && ++ UNIV_LIKELY(slot->object != NULL) && ++ slot->type == MTR_MEMO_PAGE_X_FIX) { ++ buf_flush_note_modification( ++ (buf_block_t*)slot->object, mtr); ++ } + + mtr_memo_slot_release(mtr, slot); + +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-08-28 11:08:17.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-08-28 11:06:30.000000000 +0900 +@@ -370,6 +370,7 @@ + ulong srv_n_free_tickets_to_enter = 500; + ulong srv_thread_sleep_delay = 10000; + ulint srv_spin_wait_delay = 5; ++ulint srv_spins_microsec = 50; + ibool srv_priority_boost = TRUE; + + ibool srv_print_thread_releases = FALSE; +@@ -676,6 +677,47 @@ + ulint srv_n_threads_active[SRV_MASTER + 1]; + ulint srv_n_threads[SRV_MASTER + 1]; + ++static ++void ++srv_align_spins_microsec(void) ++{ ++ ulint start_sec, end_sec; ++ ulint 
start_usec, end_usec; ++ ib_longlong usecs; ++ ++ /* change temporary */ ++ srv_spins_microsec = 1; ++ ++ if (ut_usectime(&start_sec, &start_usec)) { ++ srv_spins_microsec = 50; ++ goto end; ++ } ++ ++ ut_delay(100000); ++ ++ if (ut_usectime(&end_sec, &end_usec)) { ++ srv_spins_microsec = 50; ++ goto end; ++ } ++ ++ usecs = (end_sec - start_sec) * 1000000LL + (end_usec - start_usec); ++ ++ if (usecs) { ++ srv_spins_microsec = 100000 / usecs; ++ if (srv_spins_microsec == 0) ++ srv_spins_microsec = 1; ++ if (srv_spins_microsec > 50) ++ srv_spins_microsec = 50; ++ } else { ++ srv_spins_microsec = 50; ++ } ++end: ++ if (srv_spins_microsec != 50) ++ fprintf(stderr, ++ "InnoDB: unit of spin count at ut_delay() is aligned to %lu\n", ++ srv_spins_microsec); ++} ++ + /************************************************************************* + Sets the info describing an i/o thread current state. */ + +@@ -909,6 +951,8 @@ + dict_table_t* table; + ulint i; + ++ srv_align_spins_microsec(); ++ + srv_sys = mem_alloc(sizeof(srv_sys_t)); + + kernel_mutex_temp = mem_alloc(sizeof(mutex_t)); +@@ -2665,7 +2709,7 @@ + ib_longlong level, bpl; + buf_block_t* bpage; + +- mutex_enter(&buf_pool->mutex); ++ mutex_enter(&(buf_pool->flush_list_mutex)); + + level = 0; + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); +@@ -2687,7 +2731,7 @@ + bpl = 0; + } + +- mutex_exit(&buf_pool->mutex); ++ mutex_exit(&(buf_pool->flush_list_mutex)); + + if (!srv_use_doublewrite_buf) { + /* flush is faster than when doublewrite */ +diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c +--- a/innobase/sync/sync0sync.c 2009-07-07 21:54:10.000000000 +0900 ++++ b/innobase/sync/sync0sync.c 2009-08-28 11:06:30.000000000 +0900 +@@ -1105,11 +1105,19 @@ + } else if (level == SYNC_DOUBLEWRITE) { + ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE)); + } else if (level == SYNC_BUF_BLOCK) { +- ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) ++ ut_a((sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST) + 
&& sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) + || sync_thread_levels_g(array, SYNC_BUF_BLOCK)); + } else if (level == SYNC_BUF_POOL) { + ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL)); ++ } else if (level == SYNC_BUF_FLUSH_LIST) { ++ ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST)); ++ } else if (level == SYNC_BUF_FREE_LIST) { ++ ut_a(sync_thread_levels_g(array, SYNC_BUF_FREE_LIST)); ++ } else if (level == SYNC_BUF_PAGE_HASH) { ++ ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH)); ++ } else if (level == SYNC_BUF_LRU_LIST) { ++ ut_a(sync_thread_levels_g(array, SYNC_BUF_LRU_LIST)); + } else if (level == SYNC_SEARCH_SYS) { + ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS)); + } else if (level == SYNC_TRX_LOCK_HEAP) { +diff -ruN a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c +--- a/innobase/ut/ut0ut.c 2009-07-07 21:54:12.000000000 +0900 ++++ b/innobase/ut/ut0ut.c 2009-08-28 11:06:30.000000000 +0900 +@@ -347,6 +347,7 @@ + /***************************************************************** + Runs an idle loop on CPU. The argument gives the desired delay + in microseconds on 100 MHz Pentium + Visual C++. 
*/ ++extern ulint srv_spins_microsec; + + ulint + ut_delay( +@@ -358,7 +359,11 @@ + + j = 0; + +- for (i = 0; i < delay * 50; i++) { ++ for (i = 0; i < delay * srv_spins_microsec; i++) { ++#if (defined (__i386__) || defined (__x86_64__)) && defined (__GNUC__) ++ /* it is equal to the instruction 'pause' */ ++ __asm__ __volatile__ ("rep; nop"); ++#endif + j += i; + } + +diff -ruN a/patch_info/innodb_split_buf_pool_mutex.info b/patch_info/innodb_split_buf_pool_mutex.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_split_buf_pool_mutex.info 2009-08-28 11:06:30.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_split_buf_pool_mutex.patch ++Name=InnoDB patch to fix buffer pool scalability ++Version=1.0 ++Author=Yasufumi Kinoshita ++License=BSD ++Comment=Backport from XtraDB +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-08-28 11:08:17.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-08-28 11:06:30.000000000 +0900 +@@ -1507,6 +1507,13 @@ + /* We set srv_pool_size here in units of 1 kB. InnoDB internally + changes the value so that it becomes the number of database pages. */ + ++ if (innobase_buffer_pool_awe_mem_mb) { ++ /* split_buf_pool_mutex.patch don't support AWE */ ++ fputs("InnoDB: Warning: split_buf_pool_mutex.patch don't support AWE. 
Disabled.\n", ++ stderr); ++ innobase_buffer_pool_awe_mem_mb = 0; ++ } ++ + if (innobase_buffer_pool_awe_mem_mb == 0) { + /* Careful here: we first convert the signed long int to ulint + and only after that divide */ diff --git a/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch b/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch new file mode 100644 index 0000000..3b8f659 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch @@ -0,0 +1,389 @@ +diff -ruN a/innobase/configure b/innobase/configure +--- a/innobase/configure 2009-01-30 06:56:31.000000000 +0900 ++++ b/innobase/configure 2009-05-06 15:40:47.000000000 +0900 +@@ -21306,6 +21306,88 @@ + fi + done + ++ ++# as http://lists.mysql.com/commits/40686 does ++{ echo "$as_me:$LINENO: checking whether the compiler provides atomic builtins" >&5 ++echo $ECHO_N "checking whether the compiler provides atomic builtins... $ECHO_C" >&6; } ++if test "${mysql_cv_atomic_builtins+set}" = set; then ++ echo $ECHO_N "(cached) $ECHO_C" >&6 ++else ++ if test "$cross_compiling" = yes; then ++ { { echo "$as_me:$LINENO: error: cannot run test program while cross compiling ++See \`config.log' for more details." >&5 ++echo "$as_me: error: cannot run test program while cross compiling ++See \`config.log' for more details." >&2;} ++ { (exit 1); exit 1; }; } ++else ++ cat >conftest.$ac_ext <<_ACEOF ++/* confdefs.h. */ ++_ACEOF ++cat confdefs.h >>conftest.$ac_ext ++cat >>conftest.$ac_ext <<_ACEOF ++/* end confdefs.h. 
*/ ++ ++ int main() ++ { ++ int foo= -10; int bar= 10; ++ __sync_fetch_and_add(&foo, bar); ++ if (foo) ++ return -1; ++ bar= __sync_lock_test_and_set(&foo, bar); ++ if (bar || foo != 10) ++ return -1; ++ bar= __sync_val_compare_and_swap(&bar, foo, 15); ++ if (bar) ++ return -1; ++ return 0; ++ } ++ ++_ACEOF ++rm -f conftest$ac_exeext ++if { (ac_try="$ac_link" ++case "(($ac_try" in ++ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; ++ *) ac_try_echo=$ac_try;; ++esac ++eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 ++ (eval "$ac_link") 2>&5 ++ ac_status=$? ++ echo "$as_me:$LINENO: \$? = $ac_status" >&5 ++ (exit $ac_status); } && { ac_try='./conftest$ac_exeext' ++ { (case "(($ac_try" in ++ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; ++ *) ac_try_echo=$ac_try;; ++esac ++eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 ++ (eval "$ac_try") 2>&5 ++ ac_status=$? ++ echo "$as_me:$LINENO: \$? = $ac_status" >&5 ++ (exit $ac_status); }; }; then ++ mysql_cv_atomic_builtins=yes ++else ++ echo "$as_me: program exited with status $ac_status" >&5 ++echo "$as_me: failed program was:" >&5 ++sed 's/^/| /' conftest.$ac_ext >&5 ++ ++( exit $ac_status ) ++mysql_cv_atomic_builtins=no ++fi ++rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext ++fi ++ ++ ++fi ++{ echo "$as_me:$LINENO: result: $mysql_cv_atomic_builtins" >&5 ++echo "${ECHO_T}$mysql_cv_atomic_builtins" >&6; } ++ ++if test "x$mysql_cv_atomic_builtins" = xyes; then ++ ++cat >>confdefs.h <<\_ACEOF ++#define HAVE_ATOMIC_BUILTINS 1 ++_ACEOF ++ ++fi ++ + #AC_CHECK_FUNCS(readdir_r) MySQL checks that it has also the right args. + # Some versions of Unix only take 2 arguments. 
+ #AC_C_INLINE Already checked in MySQL +diff -ruN a/innobase/configure.in b/innobase/configure.in +--- a/innobase/configure.in 2009-01-30 06:42:15.000000000 +0900 ++++ b/innobase/configure.in 2009-05-06 15:40:47.000000000 +0900 +@@ -42,6 +42,31 @@ + AC_CHECK_FUNCS(sched_yield) + AC_CHECK_FUNCS(fdatasync) + AC_CHECK_FUNCS(localtime_r) ++ ++# as http://lists.mysql.com/commits/40686 does ++AC_CACHE_CHECK([whether the compiler provides atomic builtins], ++ [mysql_cv_atomic_builtins], [AC_TRY_RUN([ ++ int main() ++ { ++ int foo= -10; int bar= 10; ++ __sync_fetch_and_add(&foo, bar); ++ if (foo) ++ return -1; ++ bar= __sync_lock_test_and_set(&foo, bar); ++ if (bar || foo != 10) ++ return -1; ++ bar= __sync_val_compare_and_swap(&bar, foo, 15); ++ if (bar) ++ return -1; ++ return 0; ++ } ++], [mysql_cv_atomic_builtins=yes], [mysql_cv_atomic_builtins=no])]) ++ ++if test "x$mysql_cv_atomic_builtins" = xyes; then ++ AC_DEFINE(HAVE_ATOMIC_BUILTINS, 1, ++ [Define to 1 if compiler provides atomic builtins.]) ++fi ++ + #AC_CHECK_FUNCS(readdir_r) MySQL checks that it has also the right args. + # Some versions of Unix only take 2 arguments. + #AC_C_INLINE Already checked in MySQL +diff -ruN a/innobase/ib_config.h b/innobase/ib_config.h +--- a/innobase/ib_config.h 2009-01-30 07:05:03.000000000 +0900 ++++ b/innobase/ib_config.h 2009-05-06 15:40:47.000000000 +0900 +@@ -7,6 +7,9 @@ + /* Define to 1 if you have the <aio.h> header file. */ + #define HAVE_AIO_H 1 + ++/* Define to 1 if compiler provides atomic builtins. */ ++#define HAVE_ATOMIC_BUILTINS 1 ++ + /* Define to 1 if you have the <dlfcn.h> header file. */ + #define HAVE_DLFCN_H 1 + +diff -ruN a/innobase/ib_config.h.in b/innobase/ib_config.h.in +--- a/innobase/ib_config.h.in 2009-01-30 06:56:11.000000000 +0900 ++++ b/innobase/ib_config.h.in 2009-05-06 15:40:47.000000000 +0900 +@@ -6,6 +6,9 @@ + /* Define to 1 if you have the <aio.h> header file. */ + #undef HAVE_AIO_H + ++/* Define to 1 if compiler provides atomic builtins. 
*/ ++#undef HAVE_ATOMIC_BUILTINS ++ + /* Define to 1 if you have the <dlfcn.h> header file. */ + #undef HAVE_DLFCN_H + +diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h 2009-05-06 15:38:01.000000000 +0900 ++++ b/innobase/include/srv0srv.h 2009-05-06 16:04:36.000000000 +0900 +@@ -90,6 +90,8 @@ + extern ulint srv_mem_pool_size; + extern ulint srv_lock_table_size; + ++extern ibool srv_thread_concurrency_timer_based; ++ + extern ulint srv_n_file_io_threads; + extern ulint srv_n_read_io_threads; + extern ulint srv_n_write_io_threads; +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-05-06 15:38:01.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-05-06 17:12:54.000000000 +0900 +@@ -267,6 +267,7 @@ + computer. Bigger computers need bigger values. Value 0 will disable the + concurrency check. */ + ++ibool srv_thread_concurrency_timer_based = TRUE; + ulong srv_thread_concurrency = 0; + ulong srv_commit_concurrency = 0; + +@@ -1020,6 +1021,74 @@ + Puts an OS thread to wait if there are too many concurrent threads + (>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. 
*/ + ++#ifdef HAVE_ATOMIC_BUILTINS ++static void ++enter_innodb_with_tickets(trx_t* trx) ++{ ++ trx->declared_to_be_inside_innodb = TRUE; ++ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; ++ return; ++} ++ ++static void ++srv_conc_enter_innodb_timer_based(trx_t* trx) ++{ ++ lint conc_n_threads; ++ ibool has_yielded = FALSE; ++ ulint has_slept = 0; ++ ++ if (trx->declared_to_be_inside_innodb) { ++ ut_print_timestamp(stderr); ++ fputs( ++" InnoDB: Error: trying to declare trx to enter InnoDB, but\n" ++"InnoDB: it already is declared.\n", stderr); ++ trx_print(stderr, trx, 0); ++ putc('\n', stderr); ++ } ++retry: ++ if (srv_conc_n_threads < (lint) srv_thread_concurrency) { ++ conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); ++ if (conc_n_threads <= (lint) srv_thread_concurrency) { ++ enter_innodb_with_tickets(trx); ++ return; ++ } ++ __sync_add_and_fetch(&srv_conc_n_threads, -1); ++ } ++ if (!has_yielded) ++ { ++ has_yielded = TRUE; ++ os_thread_yield(); ++ goto retry; ++ } ++ if (trx->has_search_latch ++ || NULL != UT_LIST_GET_FIRST(trx->trx_locks)) { ++ ++ conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); ++ enter_innodb_with_tickets(trx); ++ return; ++ } ++ if (has_slept < 2) ++ { ++ trx->op_info = "sleeping before entering InnoDB"; ++ os_thread_sleep(10000); ++ trx->op_info = ""; ++ has_slept++; ++ } ++ conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); ++ enter_innodb_with_tickets(trx); ++ return; ++} ++ ++static void ++srv_conc_exit_innodb_timer_based(trx_t* trx) ++{ ++ __sync_add_and_fetch(&srv_conc_n_threads, -1); ++ trx->declared_to_be_inside_innodb = FALSE; ++ trx->n_tickets_to_enter_innodb = 0; ++ return; ++} ++#endif ++ + void + srv_conc_enter_innodb( + /*==================*/ +@@ -1043,6 +1112,13 @@ + return; + } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (srv_thread_concurrency_timer_based) { ++ srv_conc_enter_innodb_timer_based(trx); ++ return; ++ } ++#endif ++ + os_fast_mutex_lock(&srv_conc_mutex); + retry: 
+ if (trx->declared_to_be_inside_innodb) { +@@ -1196,6 +1272,15 @@ + return; + } + ++ ut_ad(srv_conc_n_threads >= 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (srv_thread_concurrency_timer_based) { ++ __sync_add_and_fetch(&srv_conc_n_threads, 1); ++ trx->declared_to_be_inside_innodb = TRUE; ++ trx->n_tickets_to_enter_innodb = 1; ++ return; ++ } ++#endif + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_threads++; +@@ -1227,8 +1312,16 @@ + return; + } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (srv_thread_concurrency_timer_based) { ++ srv_conc_exit_innodb_timer_based(trx); ++ return; ++ } ++#endif ++ + os_fast_mutex_lock(&srv_conc_mutex); + ++ ut_ad(srv_conc_n_threads > 0); + srv_conc_n_threads--; + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; +diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c 2009-05-06 15:38:01.000000000 +0900 ++++ b/innobase/srv/srv0start.c 2009-05-06 17:22:26.000000000 +0900 +@@ -1040,6 +1040,11 @@ + return(DB_ERROR); + } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ fprintf(stderr, ++ "InnoDB: use atomic builtins.\n"); ++#endif ++ + /* Since InnoDB does not currently clean up all its internal data + structures in MySQL Embedded Server Library server_end(), we + print an error message if someone tries to start up InnoDB a +diff -ruN a/patch_info/innodb_thread_concurrency_timer_based.info b/patch_info/innodb_thread_concurrency_timer_based.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_thread_concurrency_timer_based.info 2009-05-06 17:17:12.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=thread_concurrency_timer_based.patch ++Name=Use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0) ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-05-06 15:38:01.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-05-06 15:54:08.000000000 +0900 +@@ -152,6 
+152,7 @@ + innobase_open_files; + + long innobase_read_io_threads, innobase_write_io_threads; ++my_bool innobase_thread_concurrency_timer_based; + long innobase_extra_rsegments; + longlong innobase_buffer_pool_size, innobase_log_file_size; + +@@ -1477,6 +1478,9 @@ + srv_n_log_files = (ulint) innobase_log_files_in_group; + srv_log_file_size = (ulint) innobase_log_file_size; + ++ srv_thread_concurrency_timer_based = ++ (ibool) innobase_thread_concurrency_timer_based; ++ + #ifdef UNIV_LOG_ARCHIVE + srv_log_archive_on = (ulint) innobase_log_archive; + #endif /* UNIV_LOG_ARCHIVE */ +diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h +--- a/sql/ha_innodb.h 2009-05-06 15:38:01.000000000 +0900 ++++ b/sql/ha_innodb.h 2009-05-06 15:55:50.000000000 +0900 +@@ -205,6 +205,7 @@ + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; + extern long innobase_read_io_threads, innobase_write_io_threads; ++extern my_bool innobase_thread_concurrency_timer_based; + extern long innobase_extra_rsegments; + extern long innobase_force_recovery; + extern long innobase_open_files; +diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc +--- a/sql/mysqld.cc 2009-05-06 15:38:01.000000000 +0900 ++++ b/sql/mysqld.cc 2009-05-06 16:22:06.000000000 +0900 +@@ -5096,6 +5096,7 @@ + OPT_INNODB_ADAPTIVE_CHECKPOINT, + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED, + OPT_INNODB_EXTRA_RSEGMENTS, + OPT_INNODB_DICT_SIZE_LIMIT, + OPT_INNODB_ADAPTIVE_HASH_INDEX, +@@ -5455,6 +5456,11 @@ + "Number of background write I/O threads in InnoDB.", + (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, + 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_thread_concurrency_timer_based", OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED, ++ "Use InnoDB timer based concurrency throttling. 
", ++ (gptr*) &innobase_thread_concurrency_timer_based, ++ (gptr*) &innobase_thread_concurrency_timer_based, ++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"innodb_extra_rsegments", OPT_INNODB_EXTRA_RSEGMENTS, + "Number of extra user rollback segments when create new database.", + (gptr*) &innobase_extra_rsegments, (gptr*) &innobase_extra_rsegments, +diff -ruN a/sql/set_var.cc b/sql/set_var.cc +--- a/sql/set_var.cc 2009-05-06 15:38:01.000000000 +0900 ++++ b/sql/set_var.cc 2009-05-06 16:02:27.000000000 +0900 +@@ -1063,6 +1063,7 @@ + {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, ++ {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL}, + {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG}, + {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS}, + {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS}, diff --git a/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch b/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch new file mode 100644 index 0000000..9637315 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch @@ -0,0 +1,265 @@ +diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h 2009-07-06 15:59:52.000000000 +0900 ++++ b/innobase/include/srv0srv.h 2009-07-06 16:06:51.000000000 +0900 +@@ -90,6 +90,7 @@ + extern ulint srv_mem_pool_size; + extern ulint srv_lock_table_size; + ++extern ibool srv_use_sys_malloc; + extern ibool srv_thread_concurrency_timer_based; + + extern ulint srv_n_file_io_threads; +diff -ruN a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h +--- a/innobase/include/ut0mem.h 2009-07-07 21:54:07.000000000 +0900 ++++ b/innobase/include/ut0mem.h 
2009-08-03 14:42:17.000000000 +0900 +@@ -30,6 +30,13 @@ + + + /************************************************************************** ++Initializes the mem block list at database startup. */ ++ ++void ++ut_mem_block_list_init(void); ++/*========================*/ ++ ++/************************************************************************** + Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is + defined and set_to_zero is TRUE. */ + +diff -ruN a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c +--- a/innobase/mem/mem0dbg.c 2009-05-08 06:12:10.000000000 +0900 ++++ b/innobase/mem/mem0dbg.c 2009-07-06 16:48:17.000000000 +0900 +@@ -134,6 +134,14 @@ + mem_hash_initialized = TRUE; + #endif + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ /* When innodb_use_sys_malloc is set, the ++ mem_comm_pool won't be used for any allocations. We ++ create a dummy mem_comm_pool, because some statistics ++ and debugging code relies on it being initialized. */ ++ size = 1; ++ } ++ + mem_comm_pool = mem_pool_create(size); + } + +diff -ruN a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c +--- a/innobase/mem/mem0pool.c 2009-05-08 06:12:10.000000000 +0900 ++++ b/innobase/mem/mem0pool.c 2009-07-06 17:22:09.000000000 +0900 +@@ -11,6 +11,7 @@ + #include "mem0pool.ic" + #endif + ++#include "srv0srv.h" + #include "sync0sync.h" + #include "ut0mem.h" + #include "ut0lst.h" +@@ -191,8 +192,6 @@ + ulint i; + ulint used; + +- ut_a(size > 10000); +- + pool = ut_malloc(sizeof(mem_pool_t)); + + /* We do not set the memory to zero (FALSE) in the pool, +@@ -330,6 +329,10 @@ + ulint n; + ibool ret; + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ return(malloc(size)); ++ } ++ + n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE)); + + mutex_enter(&(pool->mutex)); +@@ -457,6 +460,11 @@ + ulint size; + ulint n; + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ free(ptr); ++ return; ++ } ++ + /* It may be that the area was really allocated from the OS with + regular malloc: check 
if ptr points within our memory pool */ + +diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c 2009-07-06 15:59:52.000000000 +0900 ++++ b/innobase/srv/srv0srv.c 2009-07-06 16:08:06.000000000 +0900 +@@ -273,6 +273,7 @@ + computer. Bigger computers need bigger values. Value 0 will disable the + concurrency check. */ + ++ibool srv_use_sys_malloc = TRUE; + ibool srv_thread_concurrency_timer_based = TRUE; + ulong srv_thread_concurrency = 0; + ulong srv_commit_concurrency = 0; +@@ -1012,6 +1013,7 @@ + srv_general_init(void) + /*==================*/ + { ++ ut_mem_block_list_init(); + os_sync_init(); + sync_init(); + mem_init(srv_mem_pool_size); +diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c 2009-07-06 15:59:52.000000000 +0900 ++++ b/innobase/srv/srv0start.c 2009-07-06 16:23:38.000000000 +0900 +@@ -1040,6 +1040,11 @@ + return(DB_ERROR); + } + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ fprintf(stderr, ++ "InnoDB: The InnoDB memory heap is disabled\n"); ++ } ++ + #ifdef HAVE_ATOMIC_BUILTINS + fprintf(stderr, + "InnoDB: use atomic builtins.\n"); +diff -ruN a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c +--- a/innobase/ut/ut0mem.c 2009-05-08 06:12:13.000000000 +0900 ++++ b/innobase/ut/ut0mem.c 2009-07-06 16:42:26.000000000 +0900 +@@ -15,6 +15,7 @@ + #include "mem0mem.h" + #include "os0sync.h" + #include "os0thread.h" ++#include "srv0srv.h" + + /* This struct is placed first in every allocated memory block */ + typedef struct ut_mem_block_struct ut_mem_block_t; +@@ -43,7 +44,7 @@ + + /************************************************************************** + Initializes the mem block list at database startup. 
*/ +-static ++ + void + ut_mem_block_list_init(void) + /*========================*/ +@@ -70,11 +71,21 @@ + ulint retry_count = 0; + void* ret; + +- ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */ ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ ret = malloc(n); ++ ut_a(ret || !assert_on_error); + +- if (!ut_mem_block_list_inited) { +- ut_mem_block_list_init(); ++#ifdef UNIV_SET_MEM_TO_ZERO ++ if (set_to_zero) { ++ memset(ret, '\0', n); ++ } ++#endif ++ return(ret); + } ++ ++ ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */ ++ ++ ut_a(ut_mem_block_list_inited); + retry: + os_fast_mutex_lock(&ut_list_mutex); + +@@ -223,6 +236,11 @@ + { + ut_mem_block_t* block; + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ free(ptr); ++ return; ++ } ++ + block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t)); + + os_fast_mutex_lock(&ut_list_mutex); +@@ -275,6 +293,10 @@ + ulint min_size; + void* new_ptr; + ++ if (UNIV_LIKELY(srv_use_sys_malloc)) { ++ return(realloc(ptr, size)); ++ } ++ + if (ptr == NULL) { + + return(ut_malloc(size)); +diff -ruN a/patch_info/innodb_use_sys_malloc.info b/patch_info/innodb_use_sys_malloc.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_use_sys_malloc.info 2009-07-06 16:04:24.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_use_sys_malloc.patch ++Name=InnoDB uses malloc directly (backport from InnoDB-Plugin) ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment +diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc +--- a/sql/ha_innodb.cc 2009-07-06 15:59:52.000000000 +0900 ++++ b/sql/ha_innodb.cc 2009-07-06 16:10:15.000000000 +0900 +@@ -152,6 +152,7 @@ + innobase_open_files; + + long innobase_read_io_threads, innobase_write_io_threads; ++my_bool innobase_use_sys_malloc; + my_bool innobase_thread_concurrency_timer_based; + long innobase_extra_rsegments; + longlong innobase_buffer_pool_size, innobase_log_file_size; +@@ -1492,6 +1493,8 @@ + srv_n_log_files = (ulint) 
innobase_log_files_in_group; + srv_log_file_size = (ulint) innobase_log_file_size; + ++ srv_use_sys_malloc = (ibool) innobase_use_sys_malloc; ++ + srv_thread_concurrency_timer_based = + (ibool) innobase_thread_concurrency_timer_based; + +diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h +--- a/sql/ha_innodb.h 2009-07-06 15:59:52.000000000 +0900 ++++ b/sql/ha_innodb.h 2009-07-06 16:10:42.000000000 +0900 +@@ -205,6 +205,7 @@ + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; + extern long innobase_read_io_threads, innobase_write_io_threads; ++extern my_bool innobase_use_sys_malloc; + extern my_bool innobase_thread_concurrency_timer_based; + extern long innobase_extra_rsegments; + extern long innobase_force_recovery; +diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc +--- a/sql/mysqld.cc 2009-07-06 15:59:52.000000000 +0900 ++++ b/sql/mysqld.cc 2009-07-06 16:16:56.000000000 +0900 +@@ -5102,6 +5102,7 @@ + OPT_INNODB_ADAPTIVE_CHECKPOINT, + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_USE_SYS_MALLOC, + OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED, + OPT_INNODB_EXTRA_RSEGMENTS, + OPT_INNODB_DICT_SIZE_LIMIT, +@@ -5470,6 +5471,10 @@ + "Number of background write I/O threads in InnoDB.", + (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads, + 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0}, ++ {"innodb_use_sys_malloc", OPT_INNODB_USE_SYS_MALLOC, ++ "Use OS memory allocator instead of InnoDB's internal memory allocator", ++ (gptr*) &innobase_use_sys_malloc, (gptr*) &innobase_use_sys_malloc, ++ 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + {"innodb_thread_concurrency_timer_based", OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED, + "Use InnoDB timer based concurrency throttling. 
", + (gptr*) &innobase_thread_concurrency_timer_based, +diff -ruN a/sql/set_var.cc b/sql/set_var.cc +--- a/sql/set_var.cc 2009-07-06 15:59:52.000000000 +0900 ++++ b/sql/set_var.cc 2009-07-06 16:22:05.000000000 +0900 +@@ -1093,6 +1093,7 @@ + {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS}, + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG}, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG}, ++ {"innodb_use_sys_malloc", (char*) &innobase_use_sys_malloc, SHOW_MY_BOOL}, + {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL}, + {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG}, + {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS}, diff --git a/percona/5.0.91-b22-20100522/microsec_process.patch b/percona/5.0.91-b22-20100522/microsec_process.patch new file mode 100644 index 0000000..2e68888 --- /dev/null +++ b/percona/5.0.91-b22-20100522/microsec_process.patch @@ -0,0 +1,282 @@ +diff -r e3b747e556c8 mysql-test/r/information_schema.result +--- a/mysql-test/r/information_schema.result Mon May 18 18:44:04 2009 -0700 ++++ b/mysql-test/r/information_schema.result Mon May 18 18:48:11 2009 -0700 +@@ -44,6 +44,7 @@ + COLUMN_PRIVILEGES + INDEX_STATISTICS + KEY_COLUMN_USAGE ++PROCESSLIST + PROFILING + ROUTINES + SCHEMATA +@@ -740,7 +741,7 @@ + CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1; + CREATE VIEW a2 AS SELECT t_CRASHME FROM a1; + count(*) +-106 ++107 + drop view a2, a1; + drop table t_crashme; + select table_schema,table_name, column_name from +@@ -749,6 +750,7 @@ + table_schema table_name column_name + information_schema COLUMNS COLUMN_DEFAULT + information_schema COLUMNS COLUMN_TYPE ++information_schema PROCESSLIST INFO + information_schema ROUTINES ROUTINE_DEFINITION + information_schema ROUTINES SQL_MODE + information_schema TRIGGERS ACTION_CONDITION +@@ 
-813,7 +815,7 @@ + flush privileges; + SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA; + table_schema count(*) +-information_schema 21 ++information_schema 22 + mysql 17 + create table t1 (i int, j int); + create trigger trg1 before insert on t1 for each row +@@ -1206,6 +1208,7 @@ + COLUMN_PRIVILEGES TABLE_SCHEMA + INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA ++PROCESSLIST ID + PROFILING QUERY_ID + ROUTINES ROUTINE_SCHEMA + SCHEMATA SCHEMA_NAME +@@ -1242,6 +1245,7 @@ + COLUMN_PRIVILEGES TABLE_SCHEMA + INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA ++PROCESSLIST ID + PROFILING QUERY_ID + ROUTINES ROUTINE_SCHEMA + SCHEMATA SCHEMA_NAME +@@ -1329,6 +1333,7 @@ + COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1 + INDEX_STATISTICS information_schema.INDEX_STATISTICS 1 + KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 ++PROCESSLIST information_schema.PROCESSLIST 1 + PROFILING information_schema.PROFILING 1 + ROUTINES information_schema.ROUTINES 1 + SCHEMATA information_schema.SCHEMATA 1 +diff -r e3b747e556c8 mysql-test/r/information_schema_db.result +--- a/mysql-test/r/information_schema_db.result Mon May 18 18:44:04 2009 -0700 ++++ b/mysql-test/r/information_schema_db.result Mon May 18 18:48:11 2009 -0700 +@@ -13,6 +13,7 @@ + COLUMN_PRIVILEGES + INDEX_STATISTICS + KEY_COLUMN_USAGE ++PROCESSLIST + PROFILING + ROUTINES + SCHEMATA +diff -r e3b747e556c8 mysql-test/r/mysqlshow.result +--- a/mysql-test/r/mysqlshow.result Mon May 18 18:44:04 2009 -0700 ++++ b/mysql-test/r/mysqlshow.result Mon May 18 18:48:11 2009 -0700 +@@ -87,6 +87,7 @@ + | COLUMN_PRIVILEGES | + | INDEX_STATISTICS | + | KEY_COLUMN_USAGE | ++| PROCESSLIST | + | PROFILING | + | ROUTINES | + | SCHEMATA | +@@ -113,6 +114,7 @@ + | COLUMN_PRIVILEGES | + | INDEX_STATISTICS | + | KEY_COLUMN_USAGE | ++| PROCESSLIST | + | PROFILING | + | ROUTINES | + | SCHEMATA | +diff -r e3b747e556c8 patch_info/microsec_process.info +--- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/microsec_process.info Mon May 18 18:48:11 2009 -0700 +@@ -0,0 +1,6 @@ ++File=microsec_process.patch ++Name=Adds INFOMATION_SCHEMA.PROCESSLIST with TIME_MS column ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= +diff -r e3b747e556c8 sql/mysql_priv.h +--- a/sql/mysql_priv.h Mon May 18 18:44:04 2009 -0700 ++++ b/sql/mysql_priv.h Mon May 18 18:48:11 2009 -0700 +@@ -249,6 +249,8 @@ + + /* Characters shown for the command in 'show processlist' */ + #define PROCESS_LIST_WIDTH 100 ++/* Characters shown for the command in 'information_schema.processlist' */ ++#define PROCESS_LIST_INFO_WIDTH 65535 + + #define PRECISION_FOR_DOUBLE 53 + #define PRECISION_FOR_FLOAT 24 +diff -r e3b747e556c8 sql/sql_show.cc +--- a/sql/sql_show.cc Mon May 18 18:44:04 2009 -0700 ++++ b/sql/sql_show.cc Mon May 18 18:48:11 2009 -0700 +@@ -1480,6 +1480,122 @@ + DBUG_VOID_RETURN; + } + ++int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond) ++{ ++ TABLE *table= tables->table; ++ CHARSET_INFO *cs= system_charset_info; ++ char *user; ++ ulonglong current_timer= my_timer(&current_timer, frequency); ++ DBUG_ENTER("fill_process_list"); ++ ++ user= thd->security_ctx->master_access & PROCESS_ACL ? ++ NullS : thd->security_ctx->priv_user; ++ ++ VOID(pthread_mutex_lock(&LOCK_thread_count)); ++ ++ if (!thd->killed) ++ { ++ I_List_iterator<THD> it(threads); ++ THD* tmp; ++ ++ while ((tmp= it++)) ++ { ++ Security_context *tmp_sctx= tmp->security_ctx; ++ struct st_my_thread_var *mysys_var; ++ const char *val; ++ ++ if ((!tmp->vio_ok() && !tmp->system_thread) || ++ (user && (!tmp_sctx->user || strcmp(tmp_sctx->user, user)))) ++ continue; ++ ++ restore_record(table, s->default_values); ++ /* ID */ ++ table->field[0]->store((longlong) tmp->thread_id, TRUE); ++ /* USER */ ++ val= tmp_sctx->user ? tmp_sctx->user : ++ (tmp->system_thread ?
"system user" : "unauthenticated user"); ++ table->field[1]->store(val, strlen(val), cs); ++ /* HOST */ ++ if (tmp->peer_port && (tmp_sctx->host || tmp_sctx->ip) && ++ thd->security_ctx->host_or_ip[0]) ++ { ++ char host[LIST_PROCESS_HOST_LEN + 1]; ++ my_snprintf(host, LIST_PROCESS_HOST_LEN, "%s:%u", ++ tmp_sctx->host_or_ip, tmp->peer_port); ++ table->field[2]->store(host, strlen(host), cs); ++ } ++ else ++ table->field[2]->store(tmp_sctx->host_or_ip, ++ strlen(tmp_sctx->host_or_ip), cs); ++ /* DB */ ++ if (tmp->db) ++ { ++ table->field[3]->store(tmp->db, strlen(tmp->db), cs); ++ table->field[3]->set_notnull(); ++ } ++ ++ if ((mysys_var= tmp->mysys_var)) ++ pthread_mutex_lock(&mysys_var->mutex); ++ /* COMMAND */ ++ if ((val= (char *) (tmp->killed == THD::KILL_CONNECTION? "Killed" : 0))) ++ table->field[4]->store(val, strlen(val), cs); ++ else ++ table->field[4]->store(command_name[tmp->command], ++ strlen(command_name[tmp->command]), cs); ++ /* MYSQL_TIME */ ++ ulonglong utime= (tmp->start_timer && current_timer) ? current_timer - tmp->start_timer : 0; ++ /* correction for negative time */ ++ if (utime > 2629743) utime= 0; ++ table->field[5]->store(utime / 1000000, TRUE); ++ /* STATE */ ++#ifndef EMBEDDED_LIBRARY ++ val= (char*) (tmp->locked ? "Locked" : ++ tmp->net.reading_or_writing ? ++ (tmp->net.reading_or_writing == 2 ? ++ "Writing to net" : ++ tmp->command == COM_SLEEP ? "" : ++ "Reading from net") : ++ tmp->proc_info ? tmp->proc_info : ++ tmp->mysys_var && ++ tmp->mysys_var->current_cond ? 
++ "Waiting on cond" : NullS); ++#else ++ val= (char *) "Writing to net"; ++#endif ++ if (val) ++ { ++ table->field[6]->store(val, strlen(val), cs); ++ table->field[6]->set_notnull(); ++ } ++ ++ if (mysys_var) ++ pthread_mutex_unlock(&mysys_var->mutex); ++ ++ /* INFO */ ++ if (tmp->query) ++ { ++ table->field[7]->store(tmp->query, ++ min(PROCESS_LIST_INFO_WIDTH, ++ tmp->query_length), cs); ++ table->field[7]->set_notnull(); ++ } ++ ++ /* TIME_MS */ ++ table->field[8]->store((double)(utime / 1000.0)); ++ ++ if (schema_table_store_record(thd, table)) ++ { ++ VOID(pthread_mutex_unlock(&LOCK_thread_count)); ++ DBUG_RETURN(1); ++ } ++ ++ } ++ } ++ ++ VOID(pthread_mutex_unlock(&LOCK_thread_count)); ++ DBUG_RETURN(0); ++} ++ + /***************************************************************************** + Status functions + *****************************************************************************/ +@@ -4849,6 +4965,22 @@ + {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} + }; + ++ST_FIELD_INFO processlist_fields_info[]= ++{ ++ {"ID", 4, MYSQL_TYPE_LONG, 0, 0, "Id"}, ++ {"USER", 16, MYSQL_TYPE_STRING, 0, 0, "User"}, ++ {"HOST", LIST_PROCESS_HOST_LEN, MYSQL_TYPE_STRING, 0, 0, "Host"}, ++ {"DB", NAME_LEN, MYSQL_TYPE_STRING, 0, 1, "Db"}, ++ {"COMMAND", 16, MYSQL_TYPE_STRING, 0, 0, "Command"}, ++ {"TIME", 7, MYSQL_TYPE_LONG, 0, 0, "Time"}, ++ {"STATE", 64, MYSQL_TYPE_STRING, 0, 1, "State"}, ++ {"INFO", PROCESS_LIST_INFO_WIDTH, MYSQL_TYPE_STRING, 0, 1, "Info"}, ++ {"TIME_MS", 100 * (MY_INT64_NUM_DECIMAL_DIGITS + 1) + 3, MYSQL_TYPE_DECIMAL, ++ 0, 0, "Time_ms"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ ++ + /* + Description of ST_FIELD_INFO in table.h + */ +@@ -4873,6 +5005,8 @@ + get_all_tables, 0, get_schema_key_column_usage_record, 4, 5, 0}, + {"OPEN_TABLES", open_tables_fields_info, create_schema_table, + fill_open_tables, make_old_format, 0, -1, -1, 1}, ++ {"PROCESSLIST", processlist_fields_info, create_schema_table, ++ fill_schema_processlist, make_old_format, 0, -1, -1, 0}, + 
{"PROFILING", query_profile_statistics_info, create_schema_table, + fill_query_profile_statistics_info, make_profile_table_for_show, + NULL, -1, -1, false}, +diff -r e3b747e556c8 sql/table.h +--- a/sql/table.h Mon May 18 18:44:04 2009 -0700 ++++ b/sql/table.h Mon May 18 18:48:11 2009 -0700 +@@ -379,6 +379,7 @@ + SCH_INDEX_STATS, + SCH_KEY_COLUMN_USAGE, + SCH_OPEN_TABLES, ++ SCH_PROCESSLIST, + SCH_PROFILES, + SCH_PROCEDURES, + SCH_SCHEMATA, diff --git a/percona/5.0.91-b22-20100522/microslow_innodb.patch b/percona/5.0.91-b22-20100522/microslow_innodb.patch new file mode 100644 index 0000000..11a186c --- /dev/null +++ b/percona/5.0.91-b22-20100522/microslow_innodb.patch @@ -0,0 +1,2492 @@ +diff -r 1242d4575291 include/my_getopt.h +--- a/include/my_getopt.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/include/my_getopt.h Tue Jul 28 23:42:44 2009 -0700 +@@ -28,7 +28,8 @@ + #define GET_ULL 8 + #define GET_STR 9 + #define GET_STR_ALLOC 10 +-#define GET_DISABLED 11 ++#define GET_MICROTIME 11 ++#define GET_DISABLED 12 + + #define GET_ASK_ADDR 128 + #define GET_TYPE_MASK 127 +diff -r 1242d4575291 include/my_time.h +--- a/include/my_time.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/include/my_time.h Tue Jul 28 23:42:44 2009 -0700 +@@ -140,7 +140,7 @@ + int my_date_to_str(const MYSQL_TIME *l_time, char *to); + int my_datetime_to_str(const MYSQL_TIME *l_time, char *to); + int my_TIME_to_str(const MYSQL_TIME *l_time, char *to); +- ++ulonglong my_timer(ulonglong *ltime, ulonglong frequency); + C_MODE_END + + #endif /* _my_time_h_ */ +diff -r 1242d4575291 innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/buf/buf0buf.c Tue Jul 28 23:42:44 2009 -0700 +@@ -37,6 +37,10 @@ + #include "log0log.h" + #include "trx0undo.h" + #include "srv0srv.h" ++#include "trx0trx.h" ++ ++/* prototypes for new functions added to ha_innodb.cc */ ++trx_t* innobase_get_trx(); + + /* + IMPLEMENTATION OF THE BUFFER POOL +@@ -1086,6 +1090,36 @@ + return(block); + } + 
++inline void _increment_page_get_statistics(buf_block_t* block, trx_t* trx) ++{ ++ ulint block_hash; ++ ulint block_hash_byte; ++ byte block_hash_offset; ++ ++ ut_ad(block); ++ ++ if (!srv_slow_log || !trx || !trx->take_stats) ++ return; ++ ++ if (!trx->distinct_page_access_hash) { ++ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE); ++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); ++ } ++ ++ block_hash = ut_hash_ulint((block->space << 20) + block->space + ++ block->offset, DPAH_SIZE << 3); ++ block_hash_byte = block_hash >> 3; ++ block_hash_offset = (byte) block_hash & 0x07; ++ if (block_hash_byte < 0 || block_hash_byte >= DPAH_SIZE) ++ fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %lu !!!\n", block_hash_byte, block_hash_offset); ++ if (block_hash_offset < 0 || block_hash_offset > 7) ++ fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %lu !!!\n", block_hash_byte, block_hash_offset); ++ if ((trx->distinct_page_access_hash[block_hash_byte] & ((byte) 0x01 << block_hash_offset)) == 0) ++ trx->distinct_page_access++; ++ trx->distinct_page_access_hash[block_hash_byte] |= (byte) 0x01 << block_hash_offset; ++ return; ++} ++ + /************************************************************************ + This is the general function used to get access to a database page. 
*/ + +@@ -1108,6 +1142,11 @@ + ulint fix_type; + ibool success; + ibool must_read; ++ trx_t* trx = NULL; ++ ulint sec; ++ ulint ms; ++ ib_longlong start_time; ++ ib_longlong finish_time; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) +@@ -1119,6 +1158,9 @@ + #ifndef UNIV_LOG_DEBUG + ut_ad(!ibuf_inside() || ibuf_page(space, offset)); + #endif ++ if (srv_slow_log) { ++ trx = innobase_get_trx(); ++ } + buf_pool->n_page_gets++; + loop: + block = NULL; +@@ -1148,7 +1190,7 @@ + return(NULL); + } + +- buf_read_page(space, offset); ++ buf_read_page(space, offset, trx); + + #ifdef UNIV_DEBUG + buf_dbg_counter++; +@@ -1261,6 +1303,11 @@ + /* Let us wait until the read operation + completes */ + ++ if (srv_slow_log && trx && trx->take_stats) ++ { ++ ut_usectime(&sec, &ms); ++ start_time = (ib_longlong)sec * 1000000 + ms; ++ } + for (;;) { + mutex_enter(&block->mutex); + +@@ -1276,6 +1323,12 @@ + break; + } + } ++ if (srv_slow_log && trx && trx->take_stats && start_time) ++ { ++ ut_usectime(&sec, &ms); ++ finish_time = (ib_longlong)sec * 1000000 + ms; ++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time); ++ } + } + + fix_type = MTR_MEMO_BUF_FIX; +@@ -1296,12 +1349,17 @@ + /* In the case of a first access, try to apply linear + read-ahead */ + +- buf_read_ahead_linear(space, offset); ++ buf_read_ahead_linear(space, offset, trx); + } + + #ifdef UNIV_IBUF_DEBUG + ut_a(ibuf_count_get(block->space, block->offset) == 0); + #endif ++ ++ if (srv_slow_log) { ++ _increment_page_get_statistics(block, trx); ++ } ++ + return(block->frame); + } + +@@ -1326,6 +1384,7 @@ + ibool accessed; + ibool success; + ulint fix_type; ++ trx_t* trx = NULL; + + ut_ad(mtr && block); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); +@@ -1440,7 +1499,7 @@ + read-ahead */ + + buf_read_ahead_linear(buf_frame_get_space_id(guess), +- buf_frame_get_page_no(guess)); ++ buf_frame_get_page_no(guess), trx); + } + + #ifdef UNIV_IBUF_DEBUG +@@ -1448,6 +1507,11 @@ + #endif + 
buf_pool->n_page_gets++; + ++ if (srv_slow_log) { ++ trx = innobase_get_trx(); ++ _increment_page_get_statistics(block, trx); ++ } ++ + return(TRUE); + } + +@@ -1470,6 +1534,7 @@ + buf_block_t* block; + ibool success; + ulint fix_type; ++ trx_t* trx = NULL; + + ut_ad(mtr); + ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); +@@ -1559,6 +1624,11 @@ + #endif + buf_pool->n_page_gets++; + ++ if (srv_slow_log) { ++ trx = innobase_get_trx(); ++ _increment_page_get_statistics(block, trx); ++ } ++ + return(TRUE); + } + +diff -r 1242d4575291 innobase/buf/buf0rea.c +--- a/innobase/buf/buf0rea.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/buf/buf0rea.c Tue Jul 28 23:42:44 2009 -0700 +@@ -70,7 +70,8 @@ + treat the tablespace as dropped; this is a timestamp we + use to stop dangling page reads from a tablespace + which we have DISCARDed + IMPORTed back */ +- ulint offset) /* in: page number */ ++ ulint offset, /* in: page number */ ++ trx_t* trx) + { + buf_block_t* block; + ulint wake_later; +@@ -140,10 +141,10 @@ + + ut_a(block->state == BUF_BLOCK_FILE_PAGE); + +- *err = fil_io(OS_FILE_READ | wake_later, ++ *err = _fil_io(OS_FILE_READ | wake_later, + sync, space, + offset, 0, UNIV_PAGE_SIZE, +- (void*)block->frame, (void*)block); ++ (void*)block->frame, (void*)block, trx); + ut_a(*err == DB_SUCCESS); + + if (sync) { +@@ -174,8 +175,9 @@ + the page at the given page number does not get + read even if we return a value > 0! 
*/ + ulint space, /* in: space id */ +- ulint offset) /* in: page number of a page which the current thread ++ ulint offset, /* in: page number of a page which the current thread + wants to access */ ++ trx_t* trx) + { + ib_longlong tablespace_version; + buf_block_t* block; +@@ -270,7 +272,7 @@ + if (!ibuf_bitmap_page(i)) { + count += buf_read_page_low(&err, FALSE, ibuf_mode + | OS_AIO_SIMULATED_WAKE_LATER, +- space, tablespace_version, i); ++ space, tablespace_version, i, trx); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +@@ -314,7 +316,8 @@ + /* out: number of page read requests issued: this can + be > 1 if read-ahead occurred */ + ulint space, /* in: space id */ +- ulint offset) /* in: page number */ ++ ulint offset, /* in: page number */ ++ trx_t* trx) + { + ib_longlong tablespace_version; + ulint count; +@@ -323,13 +326,13 @@ + + tablespace_version = fil_space_get_version(space); + +- count = buf_read_ahead_random(space, offset); ++ count = buf_read_ahead_random(space, offset, trx); + + /* We do the i/o in the synchronous aio mode to save thread + switches: hence TRUE */ + + count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, +- tablespace_version, offset); ++ tablespace_version, offset, trx); + srv_buf_pool_reads+= count2; + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); +@@ -374,8 +377,9 @@ + /*==================*/ + /* out: number of page read requests issued */ + ulint space, /* in: space id */ +- ulint offset) /* in: page number of a page; NOTE: the current thread ++ ulint offset, /* in: page number of a page; NOTE: the current thread + must want access to this page (see NOTE 3 above) */ ++ trx_t* trx) + { + ib_longlong tablespace_version; + buf_block_t* block; +@@ -556,7 +560,7 @@ + if (!ibuf_bitmap_page(i)) { + count += buf_read_page_low(&err, FALSE, ibuf_mode + | OS_AIO_SIMULATED_WAKE_LATER, +- space, tablespace_version, i); ++ space, tablespace_version, i, trx); + if (err == 
DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +@@ -625,10 +629,10 @@ + for (i = 0; i < n_stored; i++) { + if ((i + 1 == n_stored) && sync) { + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, +- space_ids[i], space_versions[i], page_nos[i]); ++ space_ids[i], space_versions[i], page_nos[i], NULL); + } else { + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE, +- space_ids[i], space_versions[i], page_nos[i]); ++ space_ids[i], space_versions[i], page_nos[i], NULL); + } + + if (err == DB_TABLESPACE_DELETED) { +@@ -704,11 +708,11 @@ + + if ((i + 1 == n_stored) && sync) { + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, +- tablespace_version, page_nos[i]); ++ tablespace_version, page_nos[i], NULL); + } else { + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE + | OS_AIO_SIMULATED_WAKE_LATER, +- space, tablespace_version, page_nos[i]); ++ space, tablespace_version, page_nos[i], NULL); + } + } + +diff -r 1242d4575291 innobase/fil/fil0fil.c +--- a/innobase/fil/fil0fil.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/fil/fil0fil.c Tue Jul 28 23:42:44 2009 -0700 +@@ -3527,7 +3527,7 @@ + node->name, node->handle, buf, + offset_low, offset_high, + UNIV_PAGE_SIZE * n_pages, +- NULL, NULL); ++ NULL, NULL, NULL); + #endif + if (success) { + node->size += n_pages; +@@ -3851,7 +3851,7 @@ + Reads or writes data. This operation is asynchronous (aio). 
*/ + + ulint +-fil_io( ++_fil_io( + /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace +@@ -3877,8 +3877,9 @@ + void* buf, /* in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ +- void* message) /* in: message for aio handler if non-sync ++ void* message, /* in: message for aio handler if non-sync + aio used, else ignored */ ++ trx_t* trx) + { + fil_system_t* system = fil_system; + ulint mode; +@@ -4018,7 +4019,7 @@ + #else + /* Queue the aio request */ + ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, +- offset_low, offset_high, len, node, message); ++ offset_low, offset_high, len, node, message, trx); + #endif + ut_a(ret); + +diff -r 1242d4575291 innobase/include/buf0rea.h +--- a/innobase/include/buf0rea.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/include/buf0rea.h Tue Jul 28 23:42:44 2009 -0700 +@@ -10,6 +10,7 @@ + #define buf0rea_h + + #include "univ.i" ++#include "trx0types.h" + #include "buf0types.h" + + /************************************************************************ +@@ -25,7 +26,8 @@ + /* out: number of page read requests issued: this can + be > 1 if read-ahead occurred */ + ulint space, /* in: space id */ +- ulint offset);/* in: page number */ ++ ulint offset, /* in: page number */ ++ trx_t* trx); + /************************************************************************ + Applies linear read-ahead if in the buf_pool the page is a border page of + a linear read-ahead area and all the pages in the area have been accessed. 
+@@ -55,8 +57,9 @@ + /*==================*/ + /* out: number of page read requests issued */ + ulint space, /* in: space id */ +- ulint offset);/* in: page number of a page; NOTE: the current thread ++ ulint offset, /* in: page number of a page; NOTE: the current thread + must want access to this page (see NOTE 3 above) */ ++ trx_t* trx); + /************************************************************************ + Issues read requests for pages which the ibuf module wants to read in, in + order to contract the insert buffer tree. Technically, this function is like +diff -r 1242d4575291 innobase/include/fil0fil.h +--- a/innobase/include/fil0fil.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/include/fil0fil.h Tue Jul 28 23:42:44 2009 -0700 +@@ -534,8 +534,11 @@ + /************************************************************************ + Reads or writes data. This operation is asynchronous (aio). */ + ++#define fil_io(type, sync, space_id, block_offset, byte_offset, len, buf, message) \ ++ _fil_io(type, sync, space_id, block_offset, byte_offset, len, buf, message, NULL) ++ + ulint +-fil_io( ++_fil_io( + /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace +@@ -561,8 +564,9 @@ + void* buf, /* in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ +- void* message); /* in: message for aio handler if non-sync ++ void* message, /* in: message for aio handler if non-sync + aio used, else ignored */ ++ trx_t* trx); + /************************************************************************ + Reads data from a space to a buffer. 
Remember that the possible incomplete + blocks at the end of file are ignored: they are not taken into account when +diff -r 1242d4575291 innobase/include/os0file.h +--- a/innobase/include/os0file.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/include/os0file.h Tue Jul 28 23:42:44 2009 -0700 +@@ -11,6 +11,8 @@ + + #include "univ.i" + ++#include "trx0types.h" ++ + #ifndef __WIN__ + #include <dirent.h> + #include <sys/stat.h> +@@ -421,8 +423,11 @@ + /*********************************************************************** + Requests a synchronous read operation. */ + ++#define os_file_read(file, buf, offset, offset_high, n) \ ++ _os_file_read(file, buf, offset, offset_high, n, NULL) ++ + ibool +-os_file_read( ++_os_file_read( + /*=========*/ + /* out: TRUE if request was + successful, FALSE if fail */ +@@ -432,7 +437,8 @@ + offset where to read */ + ulint offset_high,/* in: most significant 32 bits of + offset */ +- ulint n); /* in: number of bytes to read */ ++ ulint n, /* in: number of bytes to read */ ++ trx_t* trx); + /*********************************************************************** + Rewind file to its start, read at most size - 1 bytes from it to str, and + NUL-terminate str. All errors are silently ignored. This function is +@@ -584,7 +590,8 @@ + can be used to identify a completed aio + operation); if mode is OS_AIO_SYNC, these + are ignored */ +- void* message2); ++ void* message2, ++ trx_t* trx); + /**************************************************************************** + Wakes up all async i/o threads so that they know to exit themselves in + shutdown. 
*/ +diff -r 1242d4575291 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/include/srv0srv.h Tue Jul 28 23:42:44 2009 -0700 +@@ -27,6 +27,8 @@ + #define SRV_AUTO_EXTEND_INCREMENT \ + (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE)) + ++extern ibool srv_slow_log; ++ + /* This is set to TRUE if the MySQL user has set it in MySQL */ + extern ibool srv_lower_case_table_names; + +diff -r 1242d4575291 innobase/include/trx0trx.h +--- a/innobase/include/trx0trx.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/include/trx0trx.h Tue Jul 28 23:42:44 2009 -0700 +@@ -668,6 +668,17 @@ + /*------------------------------*/ + char detailed_error[256]; /* detailed error message for last + error, or empty. */ ++ /*------------------------------*/ ++ ulint io_reads; ++ ib_longlong io_read; ++ ulint io_reads_wait_timer; ++ ib_longlong lock_que_wait_ustarted; ++ ulint lock_que_wait_timer; ++ ulint innodb_que_wait_timer; ++ ulint distinct_page_access; ++#define DPAH_SIZE 8192 ++ byte* distinct_page_access_hash; ++ ibool take_stats; + }; + + #define TRX_MAX_N_THREADS 32 /* maximum number of concurrent +diff -r 1242d4575291 innobase/lock/lock0lock.c +--- a/innobase/lock/lock0lock.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/lock/lock0lock.c Tue Jul 28 23:42:44 2009 -0700 +@@ -1806,6 +1806,8 @@ + { + lock_t* lock; + trx_t* trx; ++ ulint sec; ++ ulint ms; + + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +@@ -1861,6 +1863,10 @@ + trx->que_state = TRX_QUE_LOCK_WAIT; + trx->was_chosen_as_deadlock_victim = FALSE; + trx->wait_started = time(NULL); ++ if (srv_slow_log && trx->take_stats) { ++ ut_usectime(&sec, &ms); ++ trx->lock_que_wait_ustarted = (ib_longlong)sec * 1000000 + ms; ++ } + + ut_a(que_thr_stop(thr)); + +@@ -3514,7 +3520,9 @@ + { + lock_t* lock; + trx_t* trx; +- ++ ulint sec; ++ ulint ms; ++ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); + #endif /* UNIV_SYNC_DEBUG */ +@@ -3564,6 
+3572,10 @@ + return(DB_SUCCESS); + } + ++ if (srv_slow_log && trx->take_stats) { ++ ut_usectime(&sec, &ms); ++ trx->lock_que_wait_ustarted = (ib_longlong)sec * 1000000 + ms; ++ } + trx->que_state = TRX_QUE_LOCK_WAIT; + trx->was_chosen_as_deadlock_victim = FALSE; + trx->wait_started = time(NULL); +diff -r 1242d4575291 innobase/os/os0file.c +--- a/innobase/os/os0file.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/os/os0file.c Tue Jul 28 23:42:44 2009 -0700 +@@ -14,6 +14,8 @@ + #include "srv0start.h" + #include "fil0fil.h" + #include "buf0buf.h" ++#include "trx0sys.h" ++#include "trx0trx.h" + + #if defined(UNIV_HOTBACKUP) && defined(__WIN__) + /* Add includes for the _stat() call to compile on Windows */ +@@ -1903,9 +1905,13 @@ + #ifndef __WIN__ + /*********************************************************************** + Does a synchronous read operation in Posix. */ ++ ++#define os_file_pread(file, buf, n, offset, offset_high) \ ++ _os_file_pread(file, buf, n, offset, offset_high, NULL); ++ + static + ssize_t +-os_file_pread( ++_os_file_pread( + /*==========*/ + /* out: number of bytes read, -1 if error */ + os_file_t file, /* in: handle to a file */ +@@ -1913,12 +1919,17 @@ + ulint n, /* in: number of bytes to read */ + ulint offset, /* in: least significant 32 bits of file + offset from where to read */ +- ulint offset_high) /* in: most significant 32 bits of +- offset */ ++ ulint offset_high, /* in: most significant 32 bits of ++ offset */ ++ trx_t* trx) + { + off_t offs; + ssize_t n_bytes; +- ++ ulint sec; ++ ulint ms; ++ ib_longlong start_time; ++ ib_longlong finish_time; ++ + ut_a((offset & 0xFFFFFFFFUL) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a +@@ -1937,7 +1948,13 @@ + } + + os_n_file_reads++; +- ++ if (srv_slow_log && trx && trx->take_stats) ++ { ++ trx->io_reads++; ++ trx->io_read += n; ++ ut_usectime(&sec, &ms); ++ start_time = (ib_longlong)sec * 1000000 + ms; ++ } + #if defined(HAVE_PREAD) && 
!defined(HAVE_BROKEN_PREAD) + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; +@@ -1951,6 +1968,13 @@ + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + ++ if (srv_slow_log && trx && trx->take_stats && start_time) ++ { ++ ut_usectime(&sec, &ms); ++ finish_time = (ib_longlong)sec * 1000000 + ms; ++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time); ++ } ++ + return(n_bytes); + #else + { +@@ -1981,6 +2005,13 @@ + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + ++ if (srv_slow_log && trx && trx->take_stats && start_time) ++ { ++ ut_usectime(&sec, &ms); ++ finish_time = (ib_longlong)sec * 1000000 + ms; ++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time); ++ } ++ + return(ret); + } + #endif +@@ -2103,7 +2134,7 @@ + Requests a synchronous positioned read operation. */ + + ibool +-os_file_read( ++_os_file_read( + /*=========*/ + /* out: TRUE if request was + successful, FALSE if fail */ +@@ -2113,7 +2144,8 @@ + offset where to read */ + ulint offset_high, /* in: most significant 32 bits of + offset */ +- ulint n) /* in: number of bytes to read */ ++ ulint n, /* in: number of bytes to read */ ++ trx_t* trx) + { + #ifdef __WIN__ + BOOL ret; +@@ -2177,7 +2209,7 @@ + os_bytes_read_since_printout += n; + + try_again: +- ret = os_file_pread(file, buf, n, offset, offset_high); ++ ret = _os_file_pread(file, buf, n, offset, offset_high, trx); + + if ((ulint)ret == n) { + +@@ -3137,7 +3169,8 @@ + offset */ + ulint offset_high, /* in: most significant 32 bits of + offset */ +- ulint len) /* in: length of the block to read or write */ ++ ulint len, /* in: length of the block to read or write */ ++ trx_t* trx) + { + os_aio_slot_t* slot; + #ifdef WIN_ASYNC_IO +@@ -3390,7 +3423,8 @@ + can be used to identify a completed aio + operation); if mode is OS_AIO_SYNC, these + are ignored */ +- void* message2) ++ void* message2, ++ trx_t* trx) + { + os_aio_array_t* array; + os_aio_slot_t* slot; +@@ -3429,8 +3463,8 @@ + wait 
in the Windows case. */ + + if (type == OS_FILE_READ) { +- return(os_file_read(file, buf, offset, +- offset_high, n)); ++ return(_os_file_read(file, buf, offset, ++ offset_high, n, trx)); + } + + ut_a(type == OS_FILE_WRITE); +@@ -3463,8 +3497,13 @@ + ut_error; + } + ++ if (trx && type == OS_FILE_READ) ++ { ++ trx->io_reads++; ++ trx->io_read += n; ++ } + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, +- name, buf, offset, offset_high, n); ++ name, buf, offset, offset_high, n, trx); + if (type == OS_FILE_READ) { + if (os_aio_use_native_aio) { + #ifdef WIN_ASYNC_IO +diff -r 1242d4575291 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/srv/srv0srv.c Tue Jul 28 23:42:44 2009 -0700 +@@ -48,6 +48,8 @@ + #include "srv0start.h" + #include "row0mysql.h" + ++ibool srv_slow_log = 0; ++ + /* This is set to TRUE if the MySQL user has set it in MySQL; currently + affects only FOREIGN KEY definition parsing */ + ibool srv_lower_case_table_names = FALSE; +@@ -1002,6 +1004,10 @@ + ibool has_slept = FALSE; + srv_conc_slot_t* slot = NULL; + ulint i; ++ ib_longlong start_time = 0L; ++ ib_longlong finish_time = 0L; ++ ulint sec; ++ ulint ms; + + /* If trx has 'free tickets' to enter the engine left, then use one + such ticket */ +@@ -1060,6 +1066,7 @@ + if (SRV_THREAD_SLEEP_DELAY > 0) + { + os_thread_sleep(SRV_THREAD_SLEEP_DELAY); ++ trx->innodb_que_wait_timer += SRV_THREAD_SLEEP_DELAY; + } + + trx->op_info = ""; +@@ -1115,12 +1122,23 @@ + /* Go to wait for the event; when a thread leaves InnoDB it will + release this thread */ + ++ if (srv_slow_log && trx->take_stats) { ++ ut_usectime(&sec, &ms); ++ start_time = (ib_longlong)sec * 1000000 + ms; ++ } ++ + trx->op_info = "waiting in InnoDB queue"; + + os_event_wait(slot->event); + + trx->op_info = ""; + ++ if (srv_slow_log && trx->take_stats && start_time) { ++ ut_usectime(&sec, &ms); ++ finish_time = (ib_longlong)sec * 1000000 + ms; ++ 
trx->innodb_que_wait_timer += (ulint)(finish_time - start_time); ++ } ++ + os_fast_mutex_lock(&srv_conc_mutex); + + srv_conc_n_waiting_threads--; +diff -r 1242d4575291 innobase/trx/trx0trx.c +--- a/innobase/trx/trx0trx.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/innobase/trx/trx0trx.c Tue Jul 28 23:42:44 2009 -0700 +@@ -190,6 +190,15 @@ + trx->global_read_view_heap = mem_heap_create(256); + trx->global_read_view = NULL; + trx->read_view = NULL; ++ ++ trx->io_reads = 0; ++ trx->io_read = 0; ++ trx->io_reads_wait_timer = 0; ++ trx->lock_que_wait_timer = 0; ++ trx->innodb_que_wait_timer = 0; ++ trx->distinct_page_access = 0; ++ trx->distinct_page_access_hash = NULL; ++ trx->take_stats = FALSE; + + /* Set X/Open XA transaction identification to NULL */ + memset(&trx->xid, 0, sizeof(trx->xid)); +@@ -230,6 +239,11 @@ + + trx->mysql_process_no = os_proc_get_number(); + ++ if (srv_slow_log && trx->take_stats) { ++ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE); ++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); ++ } ++ + return(trx); + } + +@@ -366,6 +380,12 @@ + /*===============*/ + trx_t* trx) /* in, own: trx object */ + { ++ if (trx->distinct_page_access_hash) ++ { ++ mem_free(trx->distinct_page_access_hash); ++ trx->distinct_page_access_hash= NULL; ++ } ++ + thr_local_free(trx->mysql_thread_id); + + mutex_enter(&kernel_mutex); +@@ -389,6 +409,12 @@ + /*====================*/ + trx_t* trx) /* in, own: trx object */ + { ++ if (trx->distinct_page_access_hash) ++ { ++ mem_free(trx->distinct_page_access_hash); ++ trx->distinct_page_access_hash= NULL; ++ } ++ + mutex_enter(&kernel_mutex); + + trx_free(trx); +@@ -1064,7 +1090,10 @@ + trx_t* trx) /* in: transaction */ + { + que_thr_t* thr; +- ++ ulint sec; ++ ulint ms; ++ ib_longlong now; ++ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); + #endif /* UNIV_SYNC_DEBUG */ +@@ -1080,6 +1109,11 @@ + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + ++ if (srv_slow_log && trx->take_stats) { ++ ut_usectime(&sec, 
&ms); ++ now = (ib_longlong)sec * 1000000 + ms; ++ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); ++ } + trx->que_state = TRX_QUE_RUNNING; + } + +@@ -1093,6 +1127,9 @@ + trx_t* trx) /* in: transaction in the TRX_QUE_LOCK_WAIT state */ + { + que_thr_t* thr; ++ ulint sec; ++ ulint ms; ++ ib_longlong now; + + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&kernel_mutex)); +@@ -1109,6 +1146,11 @@ + thr = UT_LIST_GET_FIRST(trx->wait_thrs); + } + ++ if (srv_slow_log && trx->take_stats) { ++ ut_usectime(&sec, &ms); ++ now = (ib_longlong)sec * 1000000 + ms; ++ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted); ++ } + trx->que_state = TRX_QUE_RUNNING; + } + +diff -r 1242d4575291 mysys/my_getopt.c +--- a/mysys/my_getopt.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/mysys/my_getopt.c Tue Jul 28 23:42:44 2009 -0700 +@@ -827,7 +827,8 @@ + #endif + break; + default: +- DBUG_ASSERT((optp->var_type & GET_TYPE_MASK) == GET_ULL); ++ DBUG_ASSERT((optp->var_type & GET_TYPE_MASK) == GET_ULL ++ || (optp->var_type & GET_TYPE_MASK) == GET_MICROTIME); + break; + } + +@@ -1061,6 +1062,9 @@ + case GET_ULONG: + printf("%lu\n", *((ulong*) value)); + break; ++ case GET_MICROTIME: ++ printf("%6f\n", ((double)(*((longlong*) value))) / 1000000.0); ++ break; + case GET_LL: + printf("%s\n", llstr(*((longlong*) value), buff)); + break; +diff -r 1242d4575291 patch_info/microslow_innodb.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/microslow_innodb.info Tue Jul 28 23:42:44 2009 -0700 +@@ -0,0 +1,15 @@ ++File=microslow_innodb.patch ++Name=Extended statistics in slow.log ++Version=1.2 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= ++Changelog ++2008-11-26 ++YK: Fix inefficient determination of trx, Make not to call useless gettimeofday when don't use slow log. Make log_slow_queries dynamic (bool). 
++ ++2008-11-07 ++VT: Moved log_slow_rate_limit in SHOW VARIABLES into the right place ++ ++2008-11 ++Arjen Lentz: Fixups (backward compatibility) by Arjen Lentz <arjen@openquery.com.au> +diff -r 1242d4575291 scripts/mysqldumpslow.sh +--- a/scripts/mysqldumpslow.sh Tue Jul 28 23:39:12 2009 -0700 ++++ b/scripts/mysqldumpslow.sh Tue Jul 28 23:42:44 2009 -0700 +@@ -83,8 +83,8 @@ + s/^#? Time: \d{6}\s+\d+:\d+:\d+.*\n//; + my ($user,$host) = s/^#? User\@Host:\s+(\S+)\s+\@\s+(\S+).*\n// ? ($1,$2) : ('',''); + +- s/^# Query_time: (\d+) Lock_time: (\d+) Rows_sent: (\d+).*\n//; +- my ($t, $l, $r) = ($1, $2, $3); ++ s/^# Query_time: (\d+(\.\d+)?) Lock_time: (\d+(\.\d+)?) Rows_sent: (\d+(\.\d+)?).*\n//; ++ my ($t, $l, $r) = ($1, $3, $5); + $t -= $l unless $opt{l}; + + # remove fluff that mysqld writes to log when it (re)starts: +diff -r 1242d4575291 sql-common/my_time.c +--- a/sql-common/my_time.c Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql-common/my_time.c Tue Jul 28 23:42:44 2009 -0700 +@@ -1253,3 +1253,37 @@ + return 0; + } + ++/* ++ ulonglong my_timer(ulonglong *ltime, ulonglong frequency) ++ ++ For performance measurement this function returns the number ++ of microseconds since the epoch (SVr4, BSD 4.3, POSIX 1003.1-2001) ++ or system start (Windows platforms). ++ ++ On Windows platforms the frequency value (obtained via ++ QueryPerformanceFrequency) has to be specified. The global frequency ++ value is set in mysqld.cc. ++ ++ If the Windows platform doesn't support QueryPerformanceFrequency we will ++ obtain the time via GetTickCount, which has millisecond resolution only. 
++*/ ++ ++ulonglong my_timer(ulonglong *ltime, ulonglong frequency) ++{ ++ ulonglong newtime= 0; ++#ifdef __WIN__ ++ if (frequency) ++ { ++ QueryPerformanceCounter((LARGE_INTEGER *)&newtime); ++ newtime= (newtime / frequency) * 1000000 + (newtime % frequency) * 1000000 / frequency; /* ticks -> microseconds; whole seconds and remainder are scaled separately so the product cannot overflow 64 bits */ ++ } else ++ newtime= (ulonglong) GetTickCount() * 1000; /* GetTickCount only returns milliseconds */ ++#else ++ struct timeval t; ++ if (gettimeofday(&t, NULL) != -1) ++ newtime= (ulonglong)t.tv_sec * 1000000 + t.tv_usec; ++#endif ++ if (ltime) ++ *ltime= newtime; ++ return newtime; ++} +diff -r 1242d4575291 sql/filesort.cc +--- a/sql/filesort.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/filesort.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -180,6 +180,7 @@ + { + statistic_increment(thd->status_var.filesort_scan_count, &LOCK_status); + } ++ thd->query_plan_flags|= QPLAN_FILESORT; + #ifdef CAN_TRUST_RANGE + if (select && select->quick && select->quick->records > 0L) + { +@@ -245,6 +246,7 @@ + } + else + { ++ thd->query_plan_flags|= QPLAN_FILESORT_DISK; + if (table_sort.buffpek && table_sort.buffpek_len < maxbuffer) + { + x_free(table_sort.buffpek); +@@ -1116,6 +1118,7 @@ + + statistic_increment(current_thd->status_var.filesort_merge_passes, + &LOCK_status); ++ current_thd->query_plan_fsort_passes++; + if (param->not_killable) + { + killed= &not_killable; +diff -r 1242d4575291 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/ha_innodb.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -1,3 +1,4 @@ ++ + /* Copyright (C) 2000-2005 MySQL AB & Innobase Oy + + This program is free software; you can redistribute it and/or modify +@@ -819,9 +820,34 @@ + trx->check_unique_secondary = TRUE; + } + ++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) { ++ trx->take_stats = TRUE; ++ } else { ++ trx->take_stats = FALSE; ++ } ++ + return(trx); + } + ++/************************************************************************* ++Gets current trx. 
*/ ++extern "C" ++trx_t* ++innobase_get_trx() ++{ ++ THD *thd=current_thd; ++ if (likely(thd != 0)) { ++ return((trx_t*) thd->ha_data[innobase_hton.slot]); ++ } else { ++ return(NULL); ++ } ++} ++ ++void ++innobase_update_var_slow_log() ++{ ++ srv_slow_log = (ibool) opt_slow_log; ++} + + /************************************************************************* + Construct ha_innobase handler. */ +@@ -1324,6 +1350,8 @@ + + /* -------------- Log files ---------------------------*/ + ++ srv_slow_log = (ibool) opt_slow_log; ++ + /* The default dir for log files is the datadir of MySQL */ + + if (!innobase_log_group_home_dir) { +@@ -4697,6 +4725,12 @@ + trx->check_unique_secondary = FALSE; + } + ++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) { ++ trx->take_stats = TRUE; ++ } else { ++ trx->take_stats = FALSE; ++ } ++ + if (lower_case_table_names) { + srv_lower_case_table_names = TRUE; + } else { +@@ -4962,6 +4996,12 @@ + trx->check_unique_secondary = FALSE; + } + ++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) { ++ trx->take_stats = TRUE; ++ } else { ++ trx->take_stats = FALSE; ++ } ++ + name_len = strlen(name); + + assert(name_len < 1000); +@@ -5049,6 +5089,12 @@ + trx->check_foreigns = FALSE; + } + ++ if (current_thd->variables.log_slow_verbosity & SLOG_V_INNODB) { ++ trx->take_stats = TRUE; ++ } else { ++ trx->take_stats = FALSE; ++ } ++ + error = row_drop_database_for_mysql(namebuf, trx); + my_free(namebuf, MYF(0)); + +@@ -5115,6 +5161,12 @@ + trx->check_foreigns = FALSE; + } + ++ if (current_thd->variables.log_slow_verbosity & SLOG_V_INNODB) { ++ trx->take_stats = TRUE; ++ } else { ++ trx->take_stats = FALSE; ++ } ++ + name_len1 = strlen(from); + name_len2 = strlen(to); + +@@ -6122,6 +6174,7 @@ + { + row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt; + trx_t* trx; ++ int i; + + DBUG_ENTER("ha_innobase::external_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); +@@ -6245,7 +6298,24 @@ + + if (trx->n_mysql_tables_in_use == 
0) { + +- trx->mysql_n_tables_locked = 0; ++ current_thd->innodb_was_used = TRUE; ++ current_thd->innodb_io_reads += trx->io_reads; ++ current_thd->innodb_io_read += trx->io_read; ++ current_thd->innodb_io_reads_wait_timer += trx->io_reads_wait_timer; ++ current_thd->innodb_lock_que_wait_timer += trx->lock_que_wait_timer; ++ current_thd->innodb_innodb_que_wait_timer += trx->innodb_que_wait_timer; ++ current_thd->innodb_page_access += trx->distinct_page_access; ++ ++ trx->io_reads = 0; ++ trx->io_read = 0; ++ trx->io_reads_wait_timer = 0; ++ trx->lock_que_wait_timer = 0; ++ trx->innodb_que_wait_timer = 0; ++ trx->distinct_page_access = 0; ++ if (trx->distinct_page_access_hash) ++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE); ++ ++ trx->mysql_n_tables_locked = 0; + prebuilt->used_in_HANDLER = FALSE; + + if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { +diff -r 1242d4575291 sql/ha_innodb.h +--- a/sql/ha_innodb.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/ha_innodb.h Tue Jul 28 23:42:44 2009 -0700 +@@ -271,6 +271,8 @@ + + int innobase_start_trx_and_assign_read_view(THD* thd); + ++void innobase_update_var_slow_log(); ++ + /*********************************************************************** + This function is used to prepare X/Open XA distributed transaction */ + +diff -r 1242d4575291 sql/log.cc +--- a/sql/log.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/log.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -2289,11 +2289,12 @@ + */ + + bool MYSQL_LOG::write(THD *thd,const char *query, uint query_length, +- time_t query_start_arg) ++ time_t query_start_arg, ulonglong query_start_timer) + { + bool error=0; + time_t current_time; +- if (!is_open()) ++ ulonglong current_timer; ++ if (!opt_slow_log || !is_open()) + return 0; + DBUG_ENTER("MYSQL_LOG::write"); + +@@ -2303,7 +2304,8 @@ + int tmp_errno=0; + char buff[80],*end; + end=buff; +- if (!(thd->options & OPTION_UPDATE_LOG)) ++ if (!(thd->options & OPTION_UPDATE_LOG) && ++ !(thd->slave_thread && 
opt_log_slow_slave_statements)) + { + VOID(pthread_mutex_unlock(&LOCK_log)); + DBUG_RETURN(0); +@@ -2333,22 +2335,72 @@ + if (my_b_printf(&log_file, "# User@Host: %s[%s] @ %s [%s]\n", + sctx->priv_user ? + sctx->priv_user : "", +- sctx->user ? sctx->user : "", ++ sctx->user ? sctx->user : (thd->slave_thread ? "SQL_SLAVE" : ""), + sctx->host ? sctx->host : "", + sctx->ip ? sctx->ip : "") == + (uint) -1) + tmp_errno=errno; + } +- if (query_start_arg) ++ if (query_start_timer) + { ++ char buf[5][20]; ++ ulonglong current_timer= my_timer(&current_timer, frequency); /* end-of-query timestamp from my_timer() */ ++ snprintf(buf[0], 20, "%.6f", (current_timer ? (current_timer - query_start_timer):0) / 1000000.0); ++ snprintf(buf[1], 20, "%.6f", (thd->timer_after_lock - query_start_timer) / 1000000.0); ++ if (!query_length) ++ { ++ thd->sent_row_count= thd->examined_row_count= 0; ++ thd->row_count= 0; ++ thd->innodb_was_used= FALSE; ++ thd->query_plan_flags= QPLAN_NONE; ++ thd->query_plan_fsort_passes= 0; ++ } ++ + /* For slow query log */ + if (my_b_printf(&log_file, +- "# Query_time: %lu Lock_time: %lu Rows_sent: %lu Rows_examined: %lu\n", +- (ulong) (current_time - query_start_arg), +- (ulong) (thd->time_after_lock - query_start_arg), ++ "# Thread_id: %lu Schema: %s\n" \ ++ "# Query_time: %s Lock_time: %s Rows_sent: %lu Rows_examined: %lu Rows_affected: %lu Rows_read: %lu\n", ++ (ulong) thd->thread_id, (thd->db ? thd->db : ""), ++ buf[0], buf[1], ++ (ulong) thd->sent_row_count, ++ (ulong) thd->examined_row_count, ++ ((long) thd->row_count_func > 0 ) ? (ulong) thd->row_count_func : 0, ++ (ulong) thd->row_count) == (uint) -1) + tmp_errno=errno; ++ if ((thd->variables.log_slow_verbosity & SLOG_V_QUERY_PLAN) && ++ my_b_printf(&log_file, ++ "# QC_Hit: %s Full_scan: %s Full_join: %s Tmp_table: %s Tmp_table_on_disk: %s\n" \ ++ "# Filesort: %s Filesort_on_disk: %s Merge_passes: %lu\n", ++ ((thd->query_plan_flags & QPLAN_QC) ? 
"Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_FULL_SCAN) ? "Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_FULL_JOIN) ? "Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_TMP_TABLE) ? "Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_TMP_DISK) ? "Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_FILESORT) ? "Yes" : "No"), ++ ((thd->query_plan_flags & QPLAN_FILESORT_DISK) ? "Yes" : "No"), ++ thd->query_plan_fsort_passes) == (uint) -1) ++ tmp_errno=errno; ++ if ((thd->variables.log_slow_verbosity & SLOG_V_INNODB) && thd->innodb_was_used) ++ { ++ snprintf(buf[2], 20, "%.6f", thd->innodb_io_reads_wait_timer / 1000000.0); ++ snprintf(buf[3], 20, "%.6f", thd->innodb_lock_que_wait_timer / 1000000.0); ++ snprintf(buf[4], 20, "%.6f", thd->innodb_innodb_que_wait_timer / 1000000.0); ++ if (my_b_printf(&log_file, ++ "# InnoDB_IO_r_ops: %lu InnoDB_IO_r_bytes: %lu InnoDB_IO_r_wait: %s\n" \ ++ "# InnoDB_rec_lock_wait: %s InnoDB_queue_wait: %s\n" \ ++ "# InnoDB_pages_distinct: %lu\n", ++ (ulong) thd->innodb_io_reads, ++ (ulong) thd->innodb_io_read, ++ buf[2], buf[3], buf[4], ++ (ulong) thd->innodb_page_access) == (uint) -1) ++ tmp_errno=errno; ++ } ++ else ++ { ++ if ((thd->variables.log_slow_verbosity & SLOG_V_INNODB) && ++ my_b_printf(&log_file,"# No InnoDB statistics available for this query\n") == (uint) -1) ++ tmp_errno=errno; ++ } + } + if (thd->db && strcmp(thd->db,db)) + { // Database changed +diff -r 1242d4575291 sql/log_event.cc +--- a/sql/log_event.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/log_event.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -2061,6 +2061,7 @@ + /* Execute the query (note that we bypass dispatch_command()) */ + const char* found_semicolon= NULL; + mysql_parse(thd, thd->query, thd->query_length, &found_semicolon); ++ log_slow_statement(thd); + + } + else +diff -r 1242d4575291 sql/mysql_priv.h +--- a/sql/mysql_priv.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/mysql_priv.h Tue Jul 28 23:42:44 2009 -0700 +@@ -507,6 +507,78 @@ + + #define 
STRING_BUFFER_USUAL_SIZE 80 + ++/* Slow log: tables mapping option names to flag values */ ++ ++struct msl_opts ++{ ++ ulong val; ++ const char *name; ++}; ++ ++#define SLOG_V_MICROTIME 1 << 0 ++#define SLOG_V_QUERY_PLAN 1 << 1 ++#define SLOG_V_INNODB 1 << 2 ++/* ... */ ++#define SLOG_V_INVALID 1 << 31 ++#define SLOG_V_NONE SLOG_V_MICROTIME ++ ++static const struct msl_opts slog_verb[]= ++{ ++ /* Basic flags */ ++ ++ { SLOG_V_MICROTIME, "microtime" }, ++ { SLOG_V_QUERY_PLAN, "query_plan" }, ++ { SLOG_V_INNODB, "innodb" }, ++ ++ /* End of basic flags */ ++ ++ { 0, "" }, /* empty name: msl_option_get_name()/msl_flag_get_name() stop scanning here */ ++ ++ /* Complex flags */ ++ ++ { SLOG_V_MICROTIME, "minimal" }, ++ { SLOG_V_MICROTIME|SLOG_V_QUERY_PLAN, "standard" }, ++ { SLOG_V_MICROTIME|SLOG_V_QUERY_PLAN|SLOG_V_INNODB, "full" }, ++ ++ /* End of complex flags */ ++ ++ { SLOG_V_INVALID, (char *)0 } ++}; ++ ++#define QPLAN_NONE 0 ++#define QPLAN_QC 1 << 0 ++#define QPLAN_QC_NO 1 << 1 ++#define QPLAN_FULL_SCAN 1 << 2 ++#define QPLAN_FULL_JOIN 1 << 3 ++#define QPLAN_TMP_TABLE 1 << 4 ++#define QPLAN_TMP_DISK 1 << 5 ++#define QPLAN_FILESORT 1 << 6 ++#define QPLAN_FILESORT_DISK 1 << 7 ++/* ... 
*/ ++#define QPLAN_MAX 1 << 31 ++ ++#define SLOG_F_QC_NO QPLAN_QC_NO ++#define SLOG_F_FULL_SCAN QPLAN_FULL_SCAN ++#define SLOG_F_FULL_JOIN QPLAN_FULL_JOIN ++#define SLOG_F_TMP_TABLE QPLAN_TMP_TABLE ++#define SLOG_F_TMP_DISK QPLAN_TMP_DISK ++#define SLOG_F_FILESORT QPLAN_FILESORT ++#define SLOG_F_FILESORT_DISK QPLAN_FILESORT_DISK ++#define SLOG_F_INVALID 1 << 31 ++#define SLOG_F_NONE 0 ++ ++static const struct msl_opts slog_filter[]= ++{ ++ { SLOG_F_QC_NO, "qc_miss" }, ++ { SLOG_F_FULL_SCAN, "full_scan" }, ++ { SLOG_F_FULL_JOIN, "full_join" }, ++ { SLOG_F_TMP_TABLE, "tmp_table" }, ++ { SLOG_F_TMP_DISK, "tmp_table_on_disk" }, ++ { SLOG_F_FILESORT, "filesort" }, ++ { SLOG_F_FILESORT_DISK, "filesort_on_disk" }, ++ { SLOG_F_INVALID, (char *)0 } ++}; ++ + enum enum_parsing_place + { + NO_MATTER, +@@ -1365,6 +1437,7 @@ + extern bool using_update_log, opt_large_files, server_id_supplied; + extern bool opt_update_log, opt_bin_log, opt_error_log; + extern my_bool opt_log, opt_slow_log, opt_log_queries_not_using_indexes; ++extern char *opt_slow_logname; + extern bool opt_disable_networking, opt_skip_show_db; + extern my_bool opt_character_set_client_handshake; + extern bool volatile abort_loop, shutdown_in_progress, grant_option; +@@ -1376,7 +1449,8 @@ + extern my_bool opt_enable_named_pipe, opt_sync_frm, opt_allow_suspicious_udfs; + extern my_bool opt_secure_auth; + extern char* opt_secure_file_priv; +-extern my_bool opt_log_slow_admin_statements; ++extern my_bool opt_log_slow_admin_statements, opt_log_slow_slave_statements; ++extern my_bool opt_use_global_long_query_time; + extern my_bool sp_automatic_privileges, opt_noacl; + extern my_bool opt_old_style_user_limits, trust_function_creators; + extern uint opt_crash_binlog_innodb; +diff -r 1242d4575291 sql/mysqld.cc +--- a/sql/mysqld.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/mysqld.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -176,7 +176,6 @@ + static void getvolumeID(BYTE *volumeName); + #endif /* __NETWARE__ */ + +- + #ifdef 
_AIX41 + int initgroups(const char *,unsigned int); + #endif +@@ -411,10 +410,13 @@ + my_bool opt_secure_auth= 0; + char* opt_secure_file_priv= 0; + my_bool opt_log_slow_admin_statements= 0; ++my_bool opt_log_slow_slave_statements= 0; ++my_bool opt_use_global_long_query_time= 0; + my_bool lower_case_file_system= 0; + my_bool opt_large_pages= 0; + uint opt_large_page_size= 0; + my_bool opt_old_style_user_limits= 0, trust_function_creators= 0; ++char* opt_slow_logname= 0; + /* + True if there is at least one per-hour limit for some user, so we should + check them before each query (and possibly reset counters when hour is +@@ -509,6 +511,7 @@ + Ge_creator ge_creator; + Le_creator le_creator; + ++ulonglong frequency= 0; + + FILE *bootstrap_file; + int bootstrap_error; +@@ -588,7 +591,7 @@ + static int cleanup_done; + static ulong opt_specialflag, opt_myisam_block_size; + static char *opt_logname, *opt_update_logname, *opt_binlog_index_name; +-static char *opt_slow_logname, *opt_tc_heuristic_recover; ++static char *opt_tc_heuristic_recover; + static char *mysql_home_ptr, *pidfile_name_ptr; + static char **defaults_argv; + static char *opt_bin_logname; +@@ -3697,6 +3700,8 @@ + unireg_abort(1); + } + } ++ if (!QueryPerformanceFrequency((LARGE_INTEGER *)&frequency)) ++ frequency= 0; + #endif /* __WIN__ */ + + if (init_common_variables(MYSQL_CONFIG_NAME, +@@ -4947,7 +4952,7 @@ + OPT_INTERACTIVE_TIMEOUT, OPT_JOIN_BUFF_SIZE, + OPT_KEY_BUFFER_SIZE, OPT_KEY_CACHE_BLOCK_SIZE, + OPT_KEY_CACHE_DIVISION_LIMIT, OPT_KEY_CACHE_AGE_THRESHOLD, +- OPT_LONG_QUERY_TIME, ++ OPT_LONG_QUERY_TIME, OPT_MIN_EXAMINED_ROW_LIMIT, + OPT_LOWER_CASE_TABLE_NAMES, OPT_MAX_ALLOWED_PACKET, + OPT_MAX_BINLOG_CACHE_SIZE, OPT_MAX_BINLOG_SIZE, + OPT_MAX_CONNECTIONS, OPT_MAX_CONNECT_ERRORS, +@@ -5038,11 +5043,18 @@ + OPT_TIMED_MUTEXES, + OPT_OLD_STYLE_USER_LIMITS, + OPT_LOG_SLOW_ADMIN_STATEMENTS, ++ OPT_LOG_SLOW_SLAVE_STATEMENTS, ++ OPT_LOG_SLOW_RATE_LIMIT, ++ OPT_LOG_SLOW_VERBOSITY, ++ OPT_LOG_SLOW_FILTER, + 
OPT_TABLE_LOCK_WAIT_TIMEOUT, + OPT_PLUGIN_DIR, + OPT_PORT_OPEN_TIMEOUT, + OPT_MERGE, + OPT_PROFILING, ++ OPT_SLOW_LOG, ++ OPT_SLOW_QUERY_LOG_FILE, ++ OPT_USE_GLOBAL_LONG_QUERY_TIME, + OPT_INNODB_ROLLBACK_ON_TIMEOUT, + OPT_SECURE_FILE_PRIV, + OPT_KEEP_FILES_ON_CREATE, +@@ -5441,10 +5453,19 @@ + (gptr*) &opt_log_slow_admin_statements, + (gptr*) &opt_log_slow_admin_statements, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, ++ {"log-slow-slave-statements", OPT_LOG_SLOW_SLAVE_STATEMENTS, ++ "Log slow replicated statements to the slow log if it is open.", ++ (gptr*) &opt_log_slow_slave_statements, ++ (gptr*) &opt_log_slow_slave_statements, ++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"log-slow-queries", OPT_SLOW_QUERY_LOG, + "Log slow queries to this log file. Defaults logging to hostname-slow.log file. Must be enabled to activate other slow log options.", + (gptr*) &opt_slow_logname, (gptr*) &opt_slow_logname, 0, GET_STR, OPT_ARG, + 0, 0, 0, 0, 0, 0}, ++ {"slow_query_log_file", OPT_SLOW_QUERY_LOG_FILE, ++ "Log slow queries to given log file. Defaults logging to hostname-slow.log. 
Must be enabled to activate other slow log options.", ++ (gptr*) &opt_slow_logname, (gptr*) &opt_slow_logname, 0, GET_STR, OPT_ARG, ++ 0, 0, 0, 0, 0, 0}, + {"log-tc", OPT_LOG_TC, + "Path to transaction coordinator log (used for transactions that affect " + "more than one storage engine, when binary log is disabled)", +@@ -5808,6 +5829,9 @@ + "Tells the slave thread to continue replication when a query returns an error from the provided list.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + #endif ++ {"slow-query-log", OPT_SLOW_LOG, ++ "Enable|disable slow query log", (gptr*) &opt_slow_log, ++ (gptr*) &opt_slow_log, 0, GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"socket", OPT_SOCKET, "Socket file to use for connection.", + (gptr*) &mysqld_unix_port, (gptr*) &mysqld_unix_port, 0, GET_STR, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +@@ -6110,11 +6134,31 @@ + (gptr*) 0, + 0, (GET_ULONG | GET_ASK_ADDR) , REQUIRED_ARG, 100, + 1, 100, 0, 1, 0}, +- {"long_query_time", OPT_LONG_QUERY_TIME, +- "Log all queries that have taken more than long_query_time seconds to execute to file.", +- (gptr*) &global_system_variables.long_query_time, +- (gptr*) &max_system_variables.long_query_time, 0, GET_ULONG, +- REQUIRED_ARG, 10, 1, LONG_TIMEOUT, 0, 1, 0}, ++ {"log_slow_filter", OPT_LOG_SLOW_FILTER, ++ "Log only the queries that followed certain execution plan. Multiple flags allowed in a comma-separated string. 
[qc_miss, full_scan, full_join, tmp_table, tmp_table_on_disk, filesort, filesort_on_disk]", ++ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, SLOG_F_NONE, 0, 0}, ++ {"log_slow_rate_limit", OPT_LOG_SLOW_RATE_LIMIT, ++ "Rate limit statement writes to slow log to only those from every (1/log_slow_rate_limit) session.", ++ (gptr*) &global_system_variables.log_slow_rate_limit, ++ (gptr*) &max_system_variables.log_slow_rate_limit, 0, GET_ULONG, ++ REQUIRED_ARG, 1, 1, LONG_MAX, 0, 1L, 0}, ++ {"log_slow_verbosity", OPT_LOG_SLOW_VERBOSITY, ++ "Choose how verbose the messages to your slow log will be. Multiple flags allowed in a comma-separated string. [microtime, query_plan, innodb]", ++ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, SLOG_V_MICROTIME, 0, 0}, ++ {"long_query_time", OPT_LONG_QUERY_TIME, ++ "Log all queries that have taken more than long_query_time seconds to execute to file.", ++ (gptr*) &global_system_variables.long_query_time, ++ (gptr*) &max_system_variables.long_query_time, 0, GET_MICROTIME, ++ REQUIRED_ARG, 10000000, 0, LONG_TIMEOUT * 1000000, 0, 1, 0}, ++ {"min_examined_row_limit", OPT_MIN_EXAMINED_ROW_LIMIT, ++ "Don't log queries which examine less than min_examined_row_limit rows to file.", ++ (gptr*) &global_system_variables.min_examined_row_limit, ++ (gptr*) &max_system_variables.min_examined_row_limit, 0, GET_ULONG, ++ REQUIRED_ARG, 0, 0, LONG_MAX, 0, 1L, 0}, ++ {"use_global_long_query_time", OPT_USE_GLOBAL_LONG_QUERY_TIME, ++ "Control always use global long_query_time or local long_query_time.", ++ (gptr*) &opt_use_global_long_query_time, (gptr*) &opt_use_global_long_query_time, ++ 0, GET_BOOL, NO_ARG, 0, 0, 1, 0, 1, 0}, + {"lower_case_table_names", OPT_LOWER_CASE_TABLE_NAMES, + "If set to 1 table names are stored in lowercase on disk and table names will be case-insensitive. 
Should be set to 2 if you are using a case insensitive file system", + (gptr*) &lower_case_table_names, +@@ -6893,7 +6937,11 @@ + global_system_variables.max_join_size= (ulonglong) HA_POS_ERROR; + max_system_variables.max_join_size= (ulonglong) HA_POS_ERROR; + global_system_variables.old_passwords= 0; +- ++ global_system_variables.long_query_time = 10000000; ++ max_system_variables.long_query_time = LONG_TIMEOUT * 1000000; ++ global_system_variables.log_slow_verbosity= SLOG_V_MICROTIME; ++ global_system_variables.log_slow_filter= SLOG_F_NONE; ++ + /* + Default behavior for 4.1 and 5.0 is to treat NULL values as unequal + when collecting index statistics for MyISAM tables. +@@ -7364,6 +7412,35 @@ + case OPT_BOOTSTRAP: + opt_noacl=opt_bootstrap=1; + break; ++ case OPT_LOG_SLOW_FILTER: ++ if ((global_system_variables.log_slow_filter= ++ msl_flag_resolve_by_name(slog_filter, argument, ++ SLOG_F_NONE, SLOG_F_INVALID)) == SLOG_F_INVALID) ++ { ++ fprintf(stderr,"Invalid argument in log_slow_filter: %s\n", argument); ++ exit(1); ++ } ++ break; ++ case OPT_LOG_SLOW_VERBOSITY: ++ if ((global_system_variables.log_slow_verbosity= ++ msl_flag_resolve_by_name(slog_verb, argument, ++ SLOG_V_NONE, SLOG_V_INVALID)) == SLOG_V_INVALID) ++ { ++ fprintf(stderr,"Invalid argument in log_slow_verbosity: %s\n", argument); ++ exit(1); ++ } ++ break; ++ case OPT_LONG_QUERY_TIME: ++ { ++ double doubleslow = strtod(argument,NULL); ++ if (doubleslow < 0 || doubleslow > (LONG_TIMEOUT)) ++ { ++ fprintf(stderr,"Out of range long_query_time value: %s\n", argument); ++ exit(1); ++ } ++ global_system_variables.long_query_time = (ulonglong) (doubleslow * 1000000); ++ break; ++ } + case OPT_STORAGE_ENGINE: + { + if ((enum db_type)((global_system_variables.table_type= +@@ -7696,10 +7773,14 @@ + if (opt_bdb) + sql_print_warning("this binary does not contain BDB storage engine"); + #endif +- if ((opt_log_slow_admin_statements || opt_log_queries_not_using_indexes) && ++ if ((opt_log_slow_admin_statements 
|| opt_log_queries_not_using_indexes || ++ opt_log_slow_slave_statements) && + !opt_slow_log) +- sql_print_warning("options --log-slow-admin-statements and --log-queries-not-using-indexes have no effect if --log-slow-queries is not set"); +- ++ { ++ sql_print_warning("options --log-slow-admin-statements, --log-slow-slave-statements and --log-queries-not-using-indexes have no effect if --log-slow-queries is not set"); ++ opt_log_slow_slave_statements= FALSE; ++ } ++ + if (argc > 0) + { + fprintf(stderr, "%s: Too many arguments (first extra is '%s').\nUse --help to get a list of available options\n", my_progname, *argv); +diff -r 1242d4575291 sql/set_var.cc +--- a/sql/set_var.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/set_var.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -217,9 +217,13 @@ + sys_log_queries_not_using_indexes("log_queries_not_using_indexes", + &opt_log_queries_not_using_indexes); + sys_var_thd_ulong sys_log_warnings("log_warnings", &SV::log_warnings); +-sys_var_thd_ulong sys_long_query_time("long_query_time", ++sys_var_thd_microtime sys_long_query_time("long_query_time", + &SV::long_query_time); ++sys_var_bool_ptr sys_use_global_long_query_time("use_global_long_query_time", ++ &opt_use_global_long_query_time); + sys_var_bool_const_ptr sys_log_slow("log_slow_queries", &opt_slow_log); ++sys_var_log_slow sys_slow_query_log("slow_query_log", &opt_slow_log); ++sys_var_const_str_ptr sys_slow_query_log_file("slow_query_log_file", &opt_slow_logname); + sys_var_thd_bool sys_low_priority_updates("low_priority_updates", + &SV::low_priority_updates, + fix_low_priority_updates); +@@ -283,6 +287,8 @@ + &SV::max_tmp_tables); + sys_var_long_ptr sys_max_write_lock_count("max_write_lock_count", + &max_write_lock_count); ++sys_var_thd_ulong sys_min_examined_row_limit("min_examined_row_limit", ++ &SV::min_examined_row_limit); + sys_var_thd_ulong sys_multi_range_count("multi_range_count", + &SV::multi_range_count); + sys_var_long_ptr 
sys_myisam_data_pointer_size("myisam_data_pointer_size", +@@ -327,6 +333,20 @@ + sys_var_bool_ptr sys_relay_log_purge("relay_log_purge", + &relay_log_purge); + #endif ++sys_var_thd_ulong sys_log_slow_rate_limit("log_slow_rate_limit", ++ &SV::log_slow_rate_limit); ++sys_var_thd_msl_flag sys_log_slow_filter("log_slow_filter", ++ &SV::log_slow_filter, ++ SLOG_F_NONE, ++ SLOG_F_NONE, ++ SLOG_F_INVALID, ++ slog_filter); ++sys_var_thd_msl_flag sys_log_slow_verbosity("log_slow_verbosity", ++ &SV::log_slow_verbosity, ++ SLOG_V_NONE, ++ SLOG_V_MICROTIME, ++ SLOG_V_INVALID, ++ slog_verb); + sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank", + &rpl_recovery_rank); + sys_var_long_ptr sys_query_cache_size("query_cache_size", +@@ -697,6 +717,10 @@ + &sys_log_off, + &sys_log_queries_not_using_indexes, + &sys_log_slow, ++ &sys_log_slow_filter, ++ &sys_log_slow_rate_limit, ++ &sys_log_slow_verbosity, ++ &sys_use_global_long_query_time, + &sys_log_update, + &sys_log_warnings, + &sys_long_query_time, +@@ -720,6 +744,7 @@ + &sys_max_tmp_tables, + &sys_max_user_connections, + &sys_max_write_lock_count, ++ &sys_min_examined_row_limit, + &sys_multi_range_count, + &sys_myisam_data_pointer_size, + &sys_myisam_max_sort_file_size, +@@ -773,6 +798,8 @@ + &sys_slave_skip_counter, + #endif + &sys_slow_launch_time, ++ &sys_slow_query_log, ++ &sys_slow_query_log_file, + &sys_sort_buffer, + &sys_sql_big_tables, + &sys_sql_low_priority_updates, +@@ -994,8 +1021,11 @@ + {"log_slave_updates", (char*) &opt_log_slave_updates, SHOW_MY_BOOL}, + #endif + {sys_log_slow.name, (char*) &sys_log_slow, SHOW_SYS}, ++ {sys_log_slow_filter.name, (char*) &sys_log_slow_filter, SHOW_SYS}, ++ {sys_log_slow_rate_limit.name, (char*) &sys_log_slow_rate_limit, SHOW_SYS}, ++ {sys_log_slow_verbosity.name, (char*) &sys_log_slow_verbosity, SHOW_SYS}, + {sys_log_warnings.name, (char*) &sys_log_warnings, SHOW_SYS}, +- {sys_long_query_time.name, (char*) &sys_long_query_time, SHOW_SYS}, ++ {sys_long_query_time.name, 
(char*) &sys_long_query_time, SHOW_MICROTIME}, + {sys_low_priority_updates.name, (char*) &sys_low_priority_updates, SHOW_SYS}, + {"lower_case_file_system", (char*) &lower_case_file_system, SHOW_MY_BOOL}, + {"lower_case_table_names", (char*) &lower_case_table_names, SHOW_INT}, +@@ -1022,6 +1052,7 @@ + {sys_max_tmp_tables.name, (char*) &sys_max_tmp_tables, SHOW_SYS}, + {sys_max_user_connections.name,(char*) &sys_max_user_connections, SHOW_SYS}, + {sys_max_write_lock_count.name, (char*) &sys_max_write_lock_count,SHOW_SYS}, ++ {sys_min_examined_row_limit.name, (char*) &sys_min_examined_row_limit, SHOW_SYS}, + {sys_multi_range_count.name, (char*) &sys_multi_range_count, SHOW_SYS}, + {sys_myisam_data_pointer_size.name, (char*) &sys_myisam_data_pointer_size, SHOW_SYS}, + {sys_myisam_max_sort_file_size.name, (char*) &sys_myisam_max_sort_file_size, +@@ -1109,6 +1140,8 @@ + {sys_slave_trans_retries.name,(char*) &sys_slave_trans_retries, SHOW_SYS}, + #endif + {sys_slow_launch_time.name, (char*) &sys_slow_launch_time, SHOW_SYS}, ++ {sys_slow_query_log.name, (char*) &sys_slow_query_log, SHOW_SYS}, ++ {sys_slow_query_log_file.name,(char*) &sys_slow_query_log_file, SHOW_SYS}, + #ifdef HAVE_SYS_UN_H + {"socket", (char*) &mysqld_unix_port, SHOW_CHAR_PTR}, + #endif +@@ -1149,6 +1182,7 @@ + {sys_tx_isolation.name, (char*) &sys_tx_isolation, SHOW_SYS}, + {sys_updatable_views_with_limit.name, + (char*) &sys_updatable_views_with_limit,SHOW_SYS}, ++ {sys_use_global_long_query_time.name, (char*) &sys_use_global_long_query_time, SHOW_SYS}, + {sys_version.name, (char*) &sys_version, SHOW_SYS}, + #ifdef HAVE_BERKELEY_DB + {sys_version_bdb.name, (char*) &sys_version_bdb, SHOW_SYS}, +@@ -1777,6 +1811,17 @@ + } + + ++bool sys_var_thd_microtime::check(THD *thd, set_var *var) ++{ ++ if (var->value->result_type() == DECIMAL_RESULT) ++ var->save_result.ulonglong_value= (ulonglong)(var->value->val_real() * 1000000); ++ else ++ var->save_result.ulonglong_value= (ulonglong)(var->value->val_int() * 
1000000); ++ ++ return 0; ++} ++ ++ + bool sys_var_thd_bool::update(THD *thd, set_var *var) + { + if (var->type == OPT_GLOBAL) +@@ -1933,6 +1978,19 @@ + pthread_mutex_unlock(&LOCK_global_system_variables); + return new Item_int(value); + } ++ case SHOW_MICROTIME: ++ { ++ longlong value; ++ char buff[80]; ++ int len; ++ ++ pthread_mutex_lock(&LOCK_global_system_variables); ++ value= *(longlong*) value_ptr(thd, var_type, base); ++ pthread_mutex_unlock(&LOCK_global_system_variables); ++ ++ len = snprintf(buff, 80, "%f", ((double) value) / 1000000.0); ++ return new Item_float(buff,len); ++ } + case SHOW_HA_ROWS: + { + ha_rows value; +@@ -2765,6 +2823,30 @@ + } + + ++bool sys_var_log_slow::update(THD *thd, set_var *var) ++{ ++ bool ret; ++ ++ pthread_mutex_lock(&LOCK_global_system_variables); ++ if (var->save_result.ulong_value) ++ { ++ if(!mysql_slow_log.is_open()) ++ { ++ mysql_slow_log.open_slow_log(opt_slow_logname); ++ } ++ } ++ pthread_mutex_unlock(&LOCK_global_system_variables); ++ ++ ret = sys_var_bool_ptr::update(thd, var); ++ ++#ifdef HAVE_INNOBASE_DB ++ innobase_update_var_slow_log(); ++#endif ++ ++ return(ret); ++} ++ ++ + #ifdef HAVE_REPLICATION + bool sys_var_slave_skip_counter::check(THD *thd, set_var *var) + { +@@ -3549,6 +3631,191 @@ + #endif + } + ++/* Slow log stuff */ ++ ++ulong msl_option_resolve_by_name(const struct msl_opts *opts, const char *name, ulong len) ++{ ++ ulong i; ++ ++ for (i=0; opts[i].name; i++) ++ { ++ if (!my_strnncoll(&my_charset_latin1, ++ (const uchar *)name, len, ++ (const uchar *)opts[i].name, strlen(opts[i].name))) ++ return opts[i].val; ++ } ++ return opts[i].val; ++} ++ ++ulong msl_flag_resolve_by_name(const struct msl_opts *opts, const char *names_list, ++ const ulong none_val, const ulong invalid_val) ++{ ++ const char *p, *e; ++ ulong val= none_val; ++ ++ if (!*names_list) ++ return val; ++ ++ for (p= e= names_list; ; e++) ++ { ++ ulong i; ++ ++ if (*e != ',' && *e) ++ continue; ++ for (i=0; opts[i].name; i++) ++ { ++ if 
(!my_strnncoll(&my_charset_latin1, ++ (const uchar *)p, e - p, ++ (const uchar *)opts[i].name, strlen(opts[i].name))) ++ { ++ val= val | opts[i].val; ++ break; ++ } ++ } ++ if (opts[i].val == invalid_val) ++ return invalid_val; ++ if (!*e) ++ break; ++ p= e + 1; ++ } ++ return val; ++} ++ ++const char *msl_option_get_name(const struct msl_opts *opts, ulong val) ++{ ++ for (ulong i=0; opts[i].name && opts[i].name[0]; i++) ++ { ++ if (opts[i].val == val) ++ return opts[i].name; ++ } ++ return "*INVALID*"; ++} ++ ++char *msl_flag_get_name(const struct msl_opts *opts, char *buf, ulong val) ++{ ++ uint offset= 0; ++ ++ *buf= '\0'; ++ for (ulong i=0; opts[i].name && opts[i].name[0]; i++) ++ { ++ if (opts[i].val & val) ++ offset+= snprintf(buf+offset, STRING_BUFFER_USUAL_SIZE - offset - 1, ++ "%s%s", (offset ? "," : ""), opts[i].name); ++ } ++ return buf; ++} ++ ++/**************************************************************************** ++ Functions to handle log_slow_verbosity ++****************************************************************************/ ++ ++/* Based upon sys_var::check_enum() */ ++ ++bool sys_var_thd_msl_option::check(THD *thd, set_var *var) ++{ ++ char buff[STRING_BUFFER_USUAL_SIZE]; ++ String str(buff, sizeof(buff), &my_charset_latin1), *res; ++ ++ if (var->value->result_type() == STRING_RESULT) ++ { ++ ulong verb= this->invalid_val; ++ if (!(res=var->value->val_str(&str)) || ++ (var->save_result.ulong_value= ++ (ulong) (verb= msl_option_resolve_by_name(this->opts, res->ptr(), res->length()))) == this->invalid_val) ++ goto err; ++ return 0; ++ } ++ ++err: ++ my_error(ER_WRONG_ARGUMENTS, MYF(0), var->var->name); ++ return 1; ++} ++ ++byte *sys_var_thd_msl_option::value_ptr(THD *thd, enum_var_type type, ++ LEX_STRING *base) ++{ ++ ulong val; ++ val= ((type == OPT_GLOBAL) ? 
global_system_variables.*offset : ++ thd->variables.*offset); ++ const char *verbosity= msl_option_get_name(this->opts, val); ++ return (byte *) verbosity; ++} ++ ++ ++void sys_var_thd_msl_option::set_default(THD *thd, enum_var_type type) ++{ ++ if (type == OPT_GLOBAL) ++ global_system_variables.*offset= (ulong) this->default_val; ++ else ++ thd->variables.*offset= (ulong) (global_system_variables.*offset); ++} ++ ++ ++bool sys_var_thd_msl_option::update(THD *thd, set_var *var) ++{ ++ if (var->type == OPT_GLOBAL) ++ global_system_variables.*offset= var->save_result.ulong_value; ++ else ++ thd->variables.*offset= var->save_result.ulong_value; ++ return 0; ++} ++ ++/**************************************************************************** ++ Functions to handle log_slow_filter ++****************************************************************************/ ++ ++/* Based upon sys_var::check_enum() */ ++ ++bool sys_var_thd_msl_flag::check(THD *thd, set_var *var) ++{ ++ char buff[2 * STRING_BUFFER_USUAL_SIZE]; ++ String str(buff, sizeof(buff), &my_charset_latin1), *res; ++ ++ if (var->value->result_type() == STRING_RESULT) ++ { ++ ulong filter= this->none_val; ++ if (!(res=var->value->val_str(&str)) || ++ (var->save_result.ulong_value= ++ (ulong) (filter= msl_flag_resolve_by_name(this->flags, res->ptr(), this->none_val, ++ this->invalid_val))) == this->invalid_val) ++ goto err; ++ return 0; ++ } ++ ++err: ++ my_error(ER_WRONG_ARGUMENTS, MYF(0), var->var->name); ++ return 1; ++} ++ ++byte *sys_var_thd_msl_flag::value_ptr(THD *thd, enum_var_type type, ++ LEX_STRING *base) ++{ ++ ulong val; ++ val= ((type == OPT_GLOBAL) ? 
global_system_variables.*offset : ++ thd->variables.*offset); ++ msl_flag_get_name(this->flags, this->flags_string, val); ++ return (byte *) this->flags_string; ++} ++ ++ ++void sys_var_thd_msl_flag::set_default(THD *thd, enum_var_type type) ++{ ++ if (type == OPT_GLOBAL) ++ global_system_variables.*offset= (ulong) this->default_val; ++ else ++ thd->variables.*offset= (ulong) (global_system_variables.*offset); ++} ++ ++ ++bool sys_var_thd_msl_flag::update(THD *thd, set_var *var) ++{ ++ if (var->type == OPT_GLOBAL) ++ global_system_variables.*offset= var->save_result.ulong_value; ++ else ++ thd->variables.*offset= var->save_result.ulong_value; ++ return 0; ++} ++ + /**************************************************************************** + Functions to handle table_type + ****************************************************************************/ +diff -r 1242d4575291 sql/set_var.h +--- a/sql/set_var.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/set_var.h Tue Jul 28 23:42:44 2009 -0700 +@@ -132,6 +132,7 @@ + }; + + ++ + class sys_var_ulonglong_ptr :public sys_var + { + public: +@@ -168,6 +169,13 @@ + bool check_update_type(Item_result type) { return 0; } + }; + ++class sys_var_log_slow :public sys_var_bool_ptr ++{ ++public: ++ sys_var_log_slow(const char *name_arg, my_bool *value_arg) ++ :sys_var_bool_ptr(name_arg, value_arg) {} ++ bool update(THD *thd, set_var *var); ++}; + + class sys_var_bool_const_ptr : public sys_var + { +@@ -340,7 +348,6 @@ + } + }; + +- + class sys_var_thd_ulong :public sys_var_thd + { + sys_check_func check_func; +@@ -360,7 +367,6 @@ + byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base); + }; + +- + class sys_var_thd_ha_rows :public sys_var_thd + { + public: +@@ -378,7 +384,6 @@ + byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base); + }; + +- + class sys_var_thd_ulonglong :public sys_var_thd + { + public: +@@ -407,6 +412,19 @@ + } + }; + ++class sys_var_thd_microtime :public sys_var_thd_ulonglong ++{ ++public: ++ 
sys_var_thd_microtime(const char *name_arg, ulonglong SV::*offset_arg) ++ :sys_var_thd_ulonglong(name_arg, offset_arg) ++ {} ++ SHOW_TYPE show_type() { return SHOW_MICROTIME; } ++ bool check(THD *thd, set_var *var); ++ bool check_update_type(Item_result type) ++ { ++ return type != INT_RESULT && type != DECIMAL_RESULT; ++ } ++}; + + class sys_var_thd_bool :public sys_var_thd + { +@@ -478,6 +496,66 @@ + }; + + ++class sys_var_thd_msl_option :public sys_var_thd ++{ ++protected: ++ ulong SV::*offset; ++ const ulong none_val; ++ const ulong default_val; ++ const ulong invalid_val; ++ const struct msl_opts *opts; ++public: ++ sys_var_thd_msl_option(const char *name_arg, ulong SV::*offset_arg, ++ const ulong none_val_arg, ++ const ulong default_val_arg, ++ const ulong invalid_val_arg, ++ const struct msl_opts *opts_arg) ++ :sys_var_thd(name_arg), offset(offset_arg), none_val(none_val_arg), ++ default_val(default_val_arg), invalid_val(invalid_val_arg), ++ opts(opts_arg) ++ {} ++ bool check(THD *thd, set_var *var); ++ SHOW_TYPE show_type() { return SHOW_CHAR; } ++ bool check_update_type(Item_result type) ++ { ++ return type != STRING_RESULT; /* Only accept strings */ ++ } ++ void set_default(THD *thd, enum_var_type type); ++ bool update(THD *thd, set_var *var); ++ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base); ++}; ++ ++ ++class sys_var_thd_msl_flag :public sys_var_thd ++{ ++protected: ++ char flags_string[2 * STRING_BUFFER_USUAL_SIZE]; ++ ulong SV::*offset; ++ const ulong none_val; ++ const ulong default_val; ++ const ulong invalid_val; ++ const struct msl_opts *flags; ++public: ++ sys_var_thd_msl_flag(const char *name_arg, ulong SV::*offset_arg, ++ const ulong none_val_arg, ++ const ulong default_val_arg, ++ const ulong invalid_val_arg, ++ const struct msl_opts *flags_arg) ++ :sys_var_thd(name_arg), offset(offset_arg), none_val(none_val_arg), ++ default_val(default_val_arg), invalid_val(invalid_val_arg), ++ flags(flags_arg) ++ {} ++ bool check(THD *thd, 
set_var *var); ++ SHOW_TYPE show_type() { return SHOW_CHAR; } ++ bool check_update_type(Item_result type) ++ { ++ return type != STRING_RESULT; /* Only accept strings */ ++ } ++ void set_default(THD *thd, enum_var_type type); ++ bool update(THD *thd, set_var *var); ++ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base); ++}; ++ + class sys_var_thd_storage_engine :public sys_var_thd + { + protected: +@@ -1109,3 +1187,11 @@ + bool process_key_caches(int (* func) (const char *name, KEY_CACHE *)); + void delete_elements(I_List<NAMED_LIST> *list, + void (*free_element)(const char*, gptr)); ++ ++/* Slow log functions */ ++ ++ulong msl_option_resolve_by_name(const struct msl_opts *opts, const char *name, ulong len); ++ulong msl_flag_resolve_by_name(const struct msl_opts *opts, const char *names_list, ++ const ulong none_val, const ulong invalid_val); ++const char *msl_option_get_name(const struct msl_opts *opts, ulong val); ++char *msl_flag_get_name(const struct msl_opts *opts, char *buf, ulong val); +diff -r 1242d4575291 sql/slave.cc +--- a/sql/slave.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/slave.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -2983,6 +2983,12 @@ + + MAX_LOG_EVENT_HEADER; /* note, incr over the global not session var */ + thd->slave_thread = 1; + set_slave_thread_options(thd); ++ if (opt_log_slow_slave_statements) ++ { ++ thd->enable_slow_log= TRUE; ++ /* Slave thread is excluded from rate limiting the slow log writes. 
*/ ++ thd->write_to_slow_log= TRUE; ++ } + thd->client_capabilities = CLIENT_LOCAL_FILES; + thd->real_id=pthread_self(); + pthread_mutex_lock(&LOCK_thread_count); +diff -r 1242d4575291 sql/sql_cache.cc +--- a/sql/sql_cache.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_cache.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -1402,6 +1402,7 @@ + + thd->limit_found_rows = query->found_rows(); + thd->status_var.last_query_cost= 0.0; ++ thd->query_plan_flags|= QPLAN_QC; + + BLOCK_UNLOCK_RD(query_block); + DBUG_RETURN(1); // Result sent to client +@@ -1409,6 +1410,7 @@ + err_unlock: + STRUCT_UNLOCK(&structure_guard_mutex); + err: ++ thd->query_plan_flags|= QPLAN_QC_NO; + DBUG_RETURN(0); // Query was not cached + } + +diff -r 1242d4575291 sql/sql_class.cc +--- a/sql/sql_class.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_class.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -190,7 +190,7 @@ + table_map_for_update(0), + global_read_lock(0), is_fatal_error(0), + transaction_rollback_request(0), is_fatal_sub_stmt_error(0), +- rand_used(0), time_zone_used(0), ++ rand_used(0), time_zone_used(0), user_timer(0), + last_insert_id_used(0), last_insert_id_used_bin_log(0), insert_id_used(0), + clear_next_insert_id(0), in_lock_tables(0), bootstrap(0), + derived_tables_processing(FALSE), spcont(NULL), +@@ -2251,6 +2251,12 @@ + backup->cuted_fields= cuted_fields; + backup->client_capabilities= client_capabilities; + backup->savepoints= transaction.savepoints; ++ backup->innodb_io_reads= innodb_io_reads; ++ backup->innodb_io_read= innodb_io_read; ++ backup->innodb_io_reads_wait_timer= innodb_io_reads_wait_timer; ++ backup->innodb_lock_que_wait_timer= innodb_lock_que_wait_timer; ++ backup->innodb_innodb_que_wait_timer= innodb_innodb_que_wait_timer; ++ backup->innodb_page_access= innodb_page_access; + + if (!lex->requires_prelocking() || is_update_query(lex->sql_command)) + options&= ~OPTION_BIN_LOG; +@@ -2267,7 +2273,13 @@ + sent_row_count= 0; + cuted_fields= 0; + transaction.savepoints= 0; +- ++ 
innodb_io_reads= 0; ++ innodb_io_read= 0; ++ innodb_io_reads_wait_timer= 0; ++ innodb_lock_que_wait_timer= 0; ++ innodb_innodb_que_wait_timer= 0; ++ innodb_page_access= 0; ++ + /* Surpress OK packets in case if we will execute statements */ + net.no_send_ok= TRUE; + } +@@ -2320,6 +2332,12 @@ + */ + examined_row_count+= backup->examined_row_count; + cuted_fields+= backup->cuted_fields; ++ innodb_io_reads+= backup->innodb_io_reads; ++ innodb_io_read+= backup->innodb_io_read; ++ innodb_io_reads_wait_timer+= backup->innodb_io_reads_wait_timer; ++ innodb_lock_que_wait_timer+= backup->innodb_lock_que_wait_timer; ++ innodb_innodb_que_wait_timer+= backup->innodb_innodb_que_wait_timer; ++ innodb_page_access+= backup->innodb_page_access; + } + + +diff -r 1242d4575291 sql/sql_class.h +--- a/sql/sql_class.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_class.h Tue Jul 28 23:42:44 2009 -0700 +@@ -43,6 +43,7 @@ + extern char internal_table_name[2]; + extern char empty_c_string[1]; + extern const char **errmesg; ++extern ulonglong frequency; + + #define TC_LOG_PAGE_SIZE 8192 + #define TC_LOG_MIN_SIZE (3*TC_LOG_PAGE_SIZE) +@@ -321,7 +322,7 @@ + bool write(THD *thd, enum enum_server_command command, + const char *format, ...) 
ATTRIBUTE_FORMAT(printf, 4, 5); + bool write(THD *thd, const char *query, uint query_length, +- time_t query_start=0); ++ time_t query_start=0, ulonglong query_start_timer=0); + bool write(Log_event* event_info); // binary log write + bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event); + +@@ -527,13 +528,14 @@ + ulong auto_increment_increment, auto_increment_offset; + ulong bulk_insert_buff_size; + ulong join_buff_size; +- ulong long_query_time; ++ ulonglong long_query_time; + ulong max_allowed_packet; + ulong max_error_count; + ulong max_length_for_sort_data; + ulong max_sort_length; + ulong max_tmp_tables; + ulong max_insert_delayed_threads; ++ ulong min_examined_row_limit; + ulong multi_range_count; + ulong myisam_repair_threads; + ulong myisam_sort_buff_size; +@@ -549,10 +551,13 @@ + ulong preload_buff_size; + ulong profiling_history_size; + ulong query_cache_type; ++ ulong log_slow_rate_limit; + ulong read_buff_size; + ulong read_rnd_buff_size; + ulong div_precincrement; + ulong sortbuff_size; ++ ulong log_slow_filter; ++ ulong log_slow_verbosity; + ulong table_type; + ulong tx_isolation; + ulong completion_type; +@@ -1129,6 +1134,12 @@ + uint in_sub_stmt; + bool enable_slow_log, insert_id_used, clear_next_insert_id; + bool last_insert_id_used; ++ ulong innodb_io_reads; ++ ulonglong innodb_io_read; ++ ulong innodb_io_reads_wait_timer; ++ ulong innodb_lock_que_wait_timer; ++ ulong innodb_innodb_que_wait_timer; ++ ulong innodb_page_access; + my_bool no_send_ok; + SAVEPOINT *savepoints; + }; +@@ -1185,6 +1196,11 @@ + class THD :public Statement, + public Open_tables_state + { ++private: ++ inline ulonglong query_start_timer() { return start_timer; } ++ inline void set_timer() { if (user_timer) start_timer=timer_after_lock=user_timer; else timer_after_lock=my_timer(&start_timer, frequency); } ++ inline void end_timer() { my_timer(&start_timer, frequency); } ++ inline void lock_timer() { my_timer(&timer_after_lock, frequency); } + public: + /* + 
Constant for THD::where initialization in the beginning of every query. +@@ -1293,10 +1309,24 @@ + */ + const char *where; + time_t start_time,time_after_lock,user_time; ++ ulonglong start_timer,timer_after_lock, user_timer; + time_t connect_time,thr_create_time; // track down slow pthread_create + thr_lock_type update_lock_default; + Delayed_insert *di; + ++ bool write_to_slow_log; ++ ++ bool innodb_was_used; ++ ulong innodb_io_reads; ++ ulonglong innodb_io_read; ++ ulong innodb_io_reads_wait_timer; ++ ulong innodb_lock_que_wait_timer; ++ ulong innodb_innodb_que_wait_timer; ++ ulong innodb_page_access; ++ ++ ulong query_plan_flags; ++ ulong query_plan_fsort_passes; ++ + /* <> 0 if we are inside of trigger or stored function. */ + uint in_sub_stmt; + +@@ -1696,11 +1726,11 @@ + sql_print_information("time() failed with %d", errno); + } + +- inline time_t query_start() { query_start_used=1; return start_time; } +- inline void set_time() { if (user_time) start_time=time_after_lock=user_time; else { safe_time(&start_time); time_after_lock= start_time; }} +- inline void end_time() { safe_time(&start_time); } +- inline void set_time(time_t t) { time_after_lock=start_time=user_time=t; } +- inline void lock_time() { safe_time(&time_after_lock); } ++ inline time_t query_start() { query_start_timer(); query_start_used=1; return start_time; } ++ inline void set_time() { set_timer(); if (user_time) start_time=time_after_lock=user_time; else { safe_time(&start_time); time_after_lock= start_time; }} ++ inline void end_time() { end_timer(); safe_time(&start_time); } ++ inline void set_time(time_t t) { set_timer(); time_after_lock=start_time=user_time=t; } ++ inline void lock_time() { lock_timer(); safe_time(&time_after_lock); } + inline void insert_id(ulonglong id_arg) + { + last_insert_id= id_arg; +diff -r 1242d4575291 sql/sql_parse.cc +--- a/sql/sql_parse.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_parse.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -20,6 +20,7 @@ + #include 
<m_ctype.h> + #include <myisam.h> + #include <my_dir.h> ++#include <my_time.h> + + #ifdef HAVE_INNOBASE_DB + #include "ha_innodb.h" +@@ -1227,6 +1228,15 @@ + my_net_set_read_timeout(net, thd->variables.net_read_timeout); + my_net_set_write_timeout(net, thd->variables.net_write_timeout); + ++ /* ++ If rate limiting of slow log writes is enabled, decide whether to log this ++ new thread's queries or not. Uses extremely simple algorithm. :) ++ */ ++ thd->write_to_slow_log= FALSE; ++ if (thd->variables.log_slow_rate_limit <= 1 || ++ (thd->thread_id % thd->variables.log_slow_rate_limit) == 0) ++ thd->write_to_slow_log= TRUE; ++ + while (!net->error && net->vio != 0 && + !(thd->killed == THD::KILL_CONNECTION)) + { +@@ -2353,28 +2363,57 @@ + return; // Don't set time for sub stmt + + start_of_query= thd->start_time; +- thd->end_time(); // Set start time ++ ulonglong start_of_query_timer= thd->start_timer; ++ thd->end_time(); // Set start timea ++ ++ ++ /* Follow the slow log filter configuration. */ ++ if (thd->variables.log_slow_filter != SLOG_F_NONE && ++ (!(thd->variables.log_slow_filter & thd->query_plan_flags) || ++ ((thd->variables.log_slow_filter & SLOG_F_QC_NO) && ++ (thd->query_plan_flags & QPLAN_QC)))) ++ return; ++ ++ /* ++ Low long_query_time value most likely means user is debugging stuff and even ++ though some thread's queries are not supposed to be logged b/c of the rate ++ limit, if one of them takes long enough (>= 1 second) it will be sensible ++ to make an exception and write to slow log anyway. ++ */ ++ ++ if (opt_use_global_long_query_time) ++ thd->variables.long_query_time = global_system_variables.long_query_time; ++ ++ /* Do not log this thread's queries due to rate limiting. 
*/ ++ if (thd->write_to_slow_log != TRUE ++ && (thd->variables.long_query_time >= 1000000 ++ || (ulong) (thd->start_timer - thd->timer_after_lock) < 1000000)) ++ return; ++ + + /* + Do not log administrative statements unless the appropriate option is + set; do not log into slow log if reading from backup. + */ +- if (thd->enable_slow_log && !thd->user_time) ++ if (thd->enable_slow_log && ++ (!thd->user_time || (thd->slave_thread && opt_log_slow_slave_statements)) ++ ) ++ + { + thd_proc_info(thd, "logging slow query"); + +- if ((thd->start_time > thd->time_after_lock && +- (ulong) (thd->start_time - thd->time_after_lock) > +- thd->variables.long_query_time) || +- ((thd->server_status & +- (SERVER_QUERY_NO_INDEX_USED | SERVER_QUERY_NO_GOOD_INDEX_USED)) && +- opt_log_queries_not_using_indexes && +- /* == SQLCOM_END unless this is a SHOW command */ +- thd->lex->orig_sql_command == SQLCOM_END)) ++ if (((ulong) (thd->start_timer - thd->timer_after_lock) >= ++ thd->variables.long_query_time || ++ (thd->server_status & ++ (SERVER_QUERY_NO_INDEX_USED | SERVER_QUERY_NO_GOOD_INDEX_USED)) && ++ opt_log_queries_not_using_indexes && ++ /* == SQLCOM_END unless this is a SHOW command */ ++ thd->lex->orig_sql_command == SQLCOM_END) && ++ thd->examined_row_count >= thd->variables.min_examined_row_limit) + { + thd_proc_info(thd, "logging slow query"); + thd->status_var.long_query_count++; +- mysql_slow_log.write(thd, thd->query, thd->query_length, start_of_query); ++ mysql_slow_log.write(thd, thd->query, thd->query_length, start_of_query, start_of_query_timer); + } + } + } +@@ -2669,6 +2708,8 @@ + context.resolve_in_table_list_only((TABLE_LIST*)select_lex-> + table_list.first); + ++ /* Reset the counter at all cases for the extended slow query log */ ++ thd->row_count= 1; + /* + Reset warning count for each query that uses tables + A better approach would be to reset this for any commands +@@ -6203,6 +6244,15 @@ + thd->total_warn_count=0; // Warnings for this query + thd->rand_used= 
0; + thd->sent_row_count= thd->examined_row_count= 0; ++ thd->innodb_was_used= FALSE; ++ thd->innodb_io_reads= 0; ++ thd->innodb_io_read= 0; ++ thd->innodb_io_reads_wait_timer= 0; ++ thd->innodb_lock_que_wait_timer= 0; ++ thd->innodb_innodb_que_wait_timer= 0; ++ thd->innodb_page_access= 0; ++ thd->query_plan_flags= QPLAN_NONE; ++ thd->query_plan_fsort_passes= 0; + } + DBUG_VOID_RETURN; + } +diff -r 1242d4575291 sql/sql_select.cc +--- a/sql/sql_select.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_select.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -6272,8 +6272,11 @@ + { + join->thd->server_status|=SERVER_QUERY_NO_INDEX_USED; + if (statistics) ++ { + statistic_increment(join->thd->status_var.select_scan_count, + &LOCK_status); ++ join->thd->query_plan_flags|= QPLAN_FULL_SCAN; ++ } + } + } + else +@@ -6288,8 +6291,11 @@ + { + join->thd->server_status|=SERVER_QUERY_NO_INDEX_USED; + if (statistics) ++ { + statistic_increment(join->thd->status_var.select_full_join_count, + &LOCK_status); ++ join->thd->query_plan_flags|= QPLAN_FULL_JOIN; ++ } + } + } + if (!table->no_keyread) +@@ -9350,6 +9356,7 @@ + (ulong) rows_limit,test(group))); + + statistic_increment(thd->status_var.created_tmp_tables, &LOCK_status); ++ thd->query_plan_flags|= QPLAN_TMP_TABLE; + + if (use_temp_pool && !(test_flags & TEST_KEEP_TMP_TABLES)) + temp_pool_slot = bitmap_set_next(&temp_pool); +@@ -10210,6 +10217,7 @@ + } + statistic_increment(table->in_use->status_var.created_tmp_disk_tables, + &LOCK_status); ++ table->in_use->query_plan_flags|= QPLAN_TMP_DISK; + table->s->db_record_offset= 1; + DBUG_RETURN(0); + err: +diff -r 1242d4575291 sql/sql_show.cc +--- a/sql/sql_show.cc Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/sql_show.cc Tue Jul 28 23:42:44 2009 -0700 +@@ -1560,6 +1560,12 @@ + case SHOW_LONGLONG: + end= longlong10_to_str(*(longlong*) value, buff, 10); + break; ++ case SHOW_MICROTIME: ++ show_type= ((sys_var*) value)->show_type(); ++ value= (char*) ((sys_var*) value)->value_ptr(thd, value_type, ++ 
&null_lex_str); ++ end= buff + sprintf(buff, "%f", (((double) (*(ulonglong*)value))) / 1000000.0); ++ break; + case SHOW_HA_ROWS: + end= longlong10_to_str((longlong) *(ha_rows*) value, buff, 10); + break; +diff -r 1242d4575291 sql/structs.h +--- a/sql/structs.h Tue Jul 28 23:39:12 2009 -0700 ++++ b/sql/structs.h Tue Jul 28 23:42:44 2009 -0700 +@@ -168,8 +168,8 @@ + enum SHOW_TYPE + { + SHOW_UNDEF, +- SHOW_LONG, SHOW_LONGLONG, SHOW_INT, SHOW_CHAR, SHOW_CHAR_PTR, +- SHOW_DOUBLE_STATUS, ++ SHOW_LONG, SHOW_LONGLONG, SHOW_MICROTIME, SHOW_INT, SHOW_CHAR, SHOW_CHAR_PTR, ++ SHOW_DOUBLE_STATUS, + SHOW_BOOL, SHOW_MY_BOOL, SHOW_OPENTABLES, SHOW_STARTTIME, SHOW_QUERIES, + SHOW_LONG_CONST, SHOW_INT_CONST, SHOW_HAVE, SHOW_SYS, SHOW_HA_ROWS, + SHOW_VARS, diff --git a/percona/5.0.91-b22-20100522/mirror_binlog.patch b/percona/5.0.91-b22-20100522/mirror_binlog.patch new file mode 100644 index 0000000..d52e806 --- /dev/null +++ b/percona/5.0.91-b22-20100522/mirror_binlog.patch @@ -0,0 +1,2694 @@ +diff -r 66cc9e0a6768 mysql-test/lib/mtr_cases.pl +--- a/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:37:12 2008 -0800 ++++ b/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:46:15 2008 -0800 +@@ -334,6 +334,10 @@ + + $tinfo->{'slave_num'}= 1; # Default for rpl* tests, use one slave + ++ if ( $tname eq 'rpl_mirror_binlog' ) ++ { ++ $tinfo->{'slave_num'}= 3; ++ } + } + + if ( defined mtr_match_prefix($tname,"federated") ) +@@ -344,15 +348,20 @@ + + my $master_opt_file= "$testdir/$tname-master.opt"; + my $slave_opt_file= "$testdir/$tname-slave.opt"; +- my $slave_mi_file= "$testdir/$tname.slave-mi"; ++ my $slave_mi_files= ["$testdir/$tname.slave-mi", ++ "$testdir/$tname.1.slave-mi", ++ "$testdir/$tname.2.slave-mi"]; + my $master_sh= "$testdir/$tname-master.sh"; + my $slave_sh= "$testdir/$tname-slave.sh"; + my $disabled_file= "$testdir/$tname.disabled"; + my $im_opt_file= "$testdir/$tname-im.opt"; + +- $tinfo->{'master_opt'}= []; +- $tinfo->{'slave_opt'}= []; +- $tinfo->{'slave_mi'}= []; ++ 
$tinfo->{'master_opt'}= []; ++ $tinfo->{'slave_opt'}= []; ++ $tinfo->{'slave_mi'}= {}; ++ $tinfo->{'slave_mi'}{0}= []; ++ $tinfo->{'slave_mi'}{1}= []; ++ $tinfo->{'slave_mi'}{2}= []; + + if ( -f $master_opt_file ) + { +@@ -427,9 +436,14 @@ + push(@{$tinfo->{'slave_opt'}}, @$slave_opt); + } + +- if ( -f $slave_mi_file ) ++ my $mi_idx= 0; ++ foreach my $slave_mi_file ( @$slave_mi_files ) + { +- $tinfo->{'slave_mi'}= mtr_get_opts_from_file($slave_mi_file); ++ if ( -f $slave_mi_file ) ++ { ++ $tinfo->{'slave_mi'}{$mi_idx}= mtr_get_opts_from_file($slave_mi_file); ++ } ++ $mi_idx+= 1; + } + + if ( -f $master_sh ) +diff -r 66cc9e0a6768 mysql-test/mysql-test-run.pl +--- a/mysql-test/mysql-test-run.pl Thu Dec 04 21:37:12 2008 -0800 ++++ b/mysql-test/mysql-test-run.pl Thu Dec 04 21:46:15 2008 -0800 +@@ -275,6 +275,7 @@ + our $opt_stress_test_file= ""; + + our $opt_warnings; ++our $opt_slave_innodb= 0; + + our $opt_skip_ndbcluster= 0; + our $opt_skip_ndbcluster_slave= 0; +@@ -299,6 +300,8 @@ + our $used_binlog_format; + our $used_default_engine; + our $debug_compiled_binaries; ++ ++our $current_testname= ""; + + our %mysqld_variables; + +@@ -645,6 +648,7 @@ + 'testcase-timeout=i' => \$opt_testcase_timeout, + 'suite-timeout=i' => \$opt_suite_timeout, + 'warnings|log-warnings' => \$opt_warnings, ++ 'slave-innodb' => \$opt_slave_innodb, + + # Options which are no longer used + (map { $_ => \&warn_about_removed_option } @removed_options), +@@ -1001,6 +1005,14 @@ + { + $ENV{'BIG_TEST'}= 1; + } ++ ++ # -------------------------------------------------------------------------- ++ # Big test flags ++ # -------------------------------------------------------------------------- ++ if ( $opt_big_test ) ++ { ++ $ENV{'BIG_TEST'}= 1; ++ } + + # -------------------------------------------------------------------------- + # Gcov flag +@@ -1885,7 +1897,9 @@ + $ENV{'SLAVE_MYSOCK'}= $slave->[0]->{'path_sock'}; + $ENV{'SLAVE_MYPORT'}= $slave->[0]->{'port'}; + $ENV{'SLAVE_MYPORT1'}= 
$slave->[1]->{'port'}; ++ $ENV{'SLAVE_MYSOCK1'}= $slave->[1]->{'path_sock'}; + $ENV{'SLAVE_MYPORT2'}= $slave->[2]->{'port'}; ++ $ENV{'SLAVE_MYSOCK2'}= $slave->[2]->{'path_sock'}; + $ENV{'MYSQL_TCP_PORT'}= $mysqld_variables{'port'}; + $ENV{'DEFAULT_MASTER_PORT'}= $mysqld_variables{'master-port'}; + +@@ -2375,6 +2389,8 @@ + if ( ! $glob_win32 ) + { + symlink("$glob_mysql_test_dir/std_data", "$opt_vardir/std_data_ln"); ++ my @a = ("chmod", "-R", "o+r", "$glob_mysql_test_dir/std_data"); ++ system(@a) == 0 or die "system @ failed: $?" + } + else + { +@@ -3466,6 +3482,8 @@ + $ENV{'TZ'}= $tinfo->{'timezone'}; + mtr_verbose("Setting timezone: $tinfo->{'timezone'}"); + ++ $current_testname= $tinfo->{'name'}; ++ + my $master_restart= run_testcase_need_master_restart($tinfo); + my $slave_restart= run_testcase_need_slave_restart($tinfo); + +@@ -3881,7 +3899,8 @@ + unless $mysqld->{'type'} eq 'slave'; + + mtr_add_arg($args, "%s--init-rpl-role=slave", $prefix); +- if (! ( $opt_skip_slave_binlog || $skip_binlog )) ++ ++ if (! ($opt_skip_slave_binlog or ($current_testname eq 'rpl_mirror_binlog')) ) + { + mtr_add_arg($args, "%s--log-bin=%s/log/slave%s-bin", $prefix, + $opt_vardir, $sidx); # FIXME use own dir for binlogs +@@ -4568,7 +4587,7 @@ + if ( ! $slave->[$idx]->{'pid'} ) + { + mysqld_start($slave->[$idx],$tinfo->{'slave_opt'}, +- $tinfo->{'slave_mi'}); ++ $tinfo->{'slave_mi'}{$idx}); + + } + } +@@ -4580,7 +4599,6 @@ + # Wait for clusters to start + foreach my $cluster (@{$clusters}) + { +- + next if !$cluster->{'pid'}; + + if (ndbcluster_wait_started($cluster, "")) +@@ -5179,6 +5197,7 @@ + skip-im Don't start IM, and skip the IM test cases + big-test Set the environment variable BIG_TEST, which can be + checked from test cases. 
++ + + Options that specify ports + +diff -r 66cc9e0a6768 mysql-test/r/rpl_mirror_binlog.result +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/r/rpl_mirror_binlog.result Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,441 @@ ++stop slave; ++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; ++reset master; ++reset slave; ++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; ++start slave; ++drop table if exists t1; ++create table t1(n int) engine = InnoDB; ++insert into t1 values (300); ++insert into t1 values (299); ++insert into t1 values (298); ++insert into t1 values (297); ++insert into t1 values (296); ++insert into t1 values (295); ++insert into t1 values (294); ++insert into t1 values (293); ++insert into t1 values (292); ++insert into t1 values (291); ++insert into t1 values (290); ++insert into t1 values (289); ++insert into t1 values (288); ++insert into t1 values (287); ++insert into t1 values (286); ++insert into t1 values (285); ++insert into t1 values (284); ++insert into t1 values (283); ++insert into t1 values (282); ++insert into t1 values (281); ++insert into t1 values (280); ++insert into t1 values (279); ++insert into t1 values (278); ++insert into t1 values (277); ++insert into t1 values (276); ++insert into t1 values (275); ++insert into t1 values (274); ++insert into t1 values (273); ++insert into t1 values (272); ++insert into t1 values (271); ++insert into t1 values (270); ++insert into t1 values (269); ++insert into t1 values (268); ++insert into t1 values (267); ++insert into t1 values (266); ++insert into t1 values (265); ++insert into t1 values (264); ++insert into t1 values (263); ++insert into t1 values (262); ++insert into t1 values (261); ++insert into t1 values (260); ++insert into t1 values (259); ++insert into t1 values (258); ++insert into t1 values (257); ++insert into t1 values (256); ++insert into t1 values (255); ++insert into t1 values (254); ++insert into t1 values (253); ++insert into t1 values (252); ++insert 
into t1 values (251); ++insert into t1 values (250); ++insert into t1 values (249); ++insert into t1 values (248); ++insert into t1 values (247); ++insert into t1 values (246); ++insert into t1 values (245); ++insert into t1 values (244); ++insert into t1 values (243); ++insert into t1 values (242); ++insert into t1 values (241); ++insert into t1 values (240); ++insert into t1 values (239); ++insert into t1 values (238); ++insert into t1 values (237); ++insert into t1 values (236); ++insert into t1 values (235); ++insert into t1 values (234); ++insert into t1 values (233); ++insert into t1 values (232); ++insert into t1 values (231); ++insert into t1 values (230); ++insert into t1 values (229); ++insert into t1 values (228); ++insert into t1 values (227); ++insert into t1 values (226); ++insert into t1 values (225); ++insert into t1 values (224); ++insert into t1 values (223); ++insert into t1 values (222); ++insert into t1 values (221); ++insert into t1 values (220); ++insert into t1 values (219); ++insert into t1 values (218); ++insert into t1 values (217); ++insert into t1 values (216); ++insert into t1 values (215); ++insert into t1 values (214); ++insert into t1 values (213); ++insert into t1 values (212); ++insert into t1 values (211); ++insert into t1 values (210); ++insert into t1 values (209); ++insert into t1 values (208); ++insert into t1 values (207); ++insert into t1 values (206); ++insert into t1 values (205); ++insert into t1 values (204); ++insert into t1 values (203); ++insert into t1 values (202); ++insert into t1 values (201); ++insert into t1 values (200); ++insert into t1 values (199); ++insert into t1 values (198); ++insert into t1 values (197); ++insert into t1 values (196); ++insert into t1 values (195); ++insert into t1 values (194); ++insert into t1 values (193); ++insert into t1 values (192); ++insert into t1 values (191); ++insert into t1 values (190); ++insert into t1 values (189); ++insert into t1 values (188); ++insert into t1 values 
(187); ++insert into t1 values (186); ++insert into t1 values (185); ++insert into t1 values (184); ++insert into t1 values (183); ++insert into t1 values (182); ++insert into t1 values (181); ++insert into t1 values (180); ++insert into t1 values (179); ++insert into t1 values (178); ++insert into t1 values (177); ++insert into t1 values (176); ++insert into t1 values (175); ++insert into t1 values (174); ++insert into t1 values (173); ++insert into t1 values (172); ++insert into t1 values (171); ++insert into t1 values (170); ++insert into t1 values (169); ++insert into t1 values (168); ++insert into t1 values (167); ++insert into t1 values (166); ++insert into t1 values (165); ++insert into t1 values (164); ++insert into t1 values (163); ++insert into t1 values (162); ++insert into t1 values (161); ++insert into t1 values (160); ++insert into t1 values (159); ++insert into t1 values (158); ++insert into t1 values (157); ++insert into t1 values (156); ++insert into t1 values (155); ++insert into t1 values (154); ++insert into t1 values (153); ++insert into t1 values (152); ++insert into t1 values (151); ++insert into t1 values (150); ++insert into t1 values (149); ++insert into t1 values (148); ++insert into t1 values (147); ++insert into t1 values (146); ++insert into t1 values (145); ++insert into t1 values (144); ++insert into t1 values (143); ++insert into t1 values (142); ++insert into t1 values (141); ++insert into t1 values (140); ++insert into t1 values (139); ++insert into t1 values (138); ++insert into t1 values (137); ++insert into t1 values (136); ++insert into t1 values (135); ++insert into t1 values (134); ++insert into t1 values (133); ++insert into t1 values (132); ++insert into t1 values (131); ++insert into t1 values (130); ++insert into t1 values (129); ++insert into t1 values (128); ++insert into t1 values (127); ++insert into t1 values (126); ++insert into t1 values (125); ++insert into t1 values (124); ++insert into t1 values (123); ++insert 
into t1 values (122); ++insert into t1 values (121); ++insert into t1 values (120); ++insert into t1 values (119); ++insert into t1 values (118); ++insert into t1 values (117); ++insert into t1 values (116); ++insert into t1 values (115); ++insert into t1 values (114); ++insert into t1 values (113); ++insert into t1 values (112); ++insert into t1 values (111); ++insert into t1 values (110); ++insert into t1 values (109); ++insert into t1 values (108); ++insert into t1 values (107); ++insert into t1 values (106); ++insert into t1 values (105); ++insert into t1 values (104); ++insert into t1 values (103); ++insert into t1 values (102); ++insert into t1 values (101); ++insert into t1 values (100); ++insert into t1 values (99); ++insert into t1 values (98); ++insert into t1 values (97); ++insert into t1 values (96); ++insert into t1 values (95); ++insert into t1 values (94); ++insert into t1 values (93); ++insert into t1 values (92); ++insert into t1 values (91); ++insert into t1 values (90); ++insert into t1 values (89); ++insert into t1 values (88); ++insert into t1 values (87); ++insert into t1 values (86); ++insert into t1 values (85); ++insert into t1 values (84); ++insert into t1 values (83); ++insert into t1 values (82); ++insert into t1 values (81); ++insert into t1 values (80); ++insert into t1 values (79); ++insert into t1 values (78); ++insert into t1 values (77); ++insert into t1 values (76); ++insert into t1 values (75); ++insert into t1 values (74); ++insert into t1 values (73); ++insert into t1 values (72); ++insert into t1 values (71); ++insert into t1 values (70); ++insert into t1 values (69); ++insert into t1 values (68); ++insert into t1 values (67); ++insert into t1 values (66); ++insert into t1 values (65); ++insert into t1 values (64); ++insert into t1 values (63); ++insert into t1 values (62); ++insert into t1 values (61); ++insert into t1 values (60); ++insert into t1 values (59); ++insert into t1 values (58); ++insert into t1 values (57); 
++insert into t1 values (56); ++insert into t1 values (55); ++insert into t1 values (54); ++insert into t1 values (53); ++insert into t1 values (52); ++insert into t1 values (51); ++insert into t1 values (50); ++insert into t1 values (49); ++insert into t1 values (48); ++insert into t1 values (47); ++insert into t1 values (46); ++insert into t1 values (45); ++insert into t1 values (44); ++insert into t1 values (43); ++insert into t1 values (42); ++insert into t1 values (41); ++insert into t1 values (40); ++insert into t1 values (39); ++insert into t1 values (38); ++insert into t1 values (37); ++insert into t1 values (36); ++insert into t1 values (35); ++insert into t1 values (34); ++insert into t1 values (33); ++insert into t1 values (32); ++insert into t1 values (31); ++insert into t1 values (30); ++insert into t1 values (29); ++insert into t1 values (28); ++insert into t1 values (27); ++insert into t1 values (26); ++insert into t1 values (25); ++insert into t1 values (24); ++insert into t1 values (23); ++insert into t1 values (22); ++insert into t1 values (21); ++insert into t1 values (20); ++insert into t1 values (19); ++insert into t1 values (18); ++insert into t1 values (17); ++insert into t1 values (16); ++insert into t1 values (15); ++insert into t1 values (14); ++insert into t1 values (13); ++insert into t1 values (12); ++insert into t1 values (11); ++insert into t1 values (10); ++insert into t1 values (9); ++insert into t1 values (8); ++insert into t1 values (7); ++insert into t1 values (6); ++insert into t1 values (5); ++insert into t1 values (4); ++insert into t1 values (3); ++insert into t1 values (2); ++insert into t1 values (1); ++"The following are SLAVE." 
++select count(distinct n) from t1; ++count(distinct n) ++300 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++300 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9306 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No # ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000014 2849 ++"The following are SLAVE1." ++start slave; ++select count(distinct n) from t1; ++count(distinct n) ++300 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++300 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No # ++"The following are SLAVE." 
++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication-log'; ++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log ++stop slave; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication_log'; ++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log ++MAKE MASTER REVOKE SESSION WITH KILL; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication_log' ++ WITH BINLOG; ++MAKE MASTER GRANT SESSION; ++delete from t1 where n > 250; ++select count(distinct n) from t1; ++count(distinct n) ++250 ++"The following are SLAVE1." ++select count(distinct n) from t1; ++count(distinct n) ++250 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++250 ++"The following are SLAVE2." ++start slave; ++select count(distinct n) from t1; ++count(distinct n) ++250 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++250 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000015 189 # # master-bin.000015 Yes Yes # 0 0 189 # None 0 No # ++drop table t1; ++drop table t1; ++"The following are SLAVE." 
++show master logs; ++Log_name File_size ++master-bin.000001 4214 ++master-bin.000002 4212 ++master-bin.000003 4212 ++master-bin.000004 4212 ++master-bin.000005 4212 ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000015 265 ++"The following are SLAVE2." ++show master logs; ++Log_name File_size ++master-bin.000001 4214 ++master-bin.000002 4212 ++master-bin.000003 4212 ++master-bin.000004 4212 ++master-bin.000005 4212 ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000015 265 ++purge master logs to 'master-bin.000006'; ++show master logs; ++Log_name File_size ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++reset master; ++ERROR HY000: Binlog closed, cannot RESET MASTER +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-master.opt +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog-master.opt Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++-O max_binlog_size=4096 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-slave.opt +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog-slave.opt Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--rpl_mirror_binlog_enabled=1 --log-bin-index=replication_log +diff -r 66cc9e0a6768 
mysql-test/t/rpl_mirror_binlog.1.slave-mi +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.1.slave-mi Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=3 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.2.slave-mi +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.2.slave-mi Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=4 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.test +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.test Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,119 @@ ++-- source include/master-slave.inc ++-- source include/have_innodb.inc ++connect (slave_sec,localhost,root,,test,$SLAVE_MYPORT1,$SLAVE_MYSOCK1); ++connect (slave_ter,localhost,root,,test,$SLAVE_MYPORT2,$SLAVE_MYSOCK2); ++ ++connection master; ++--disable_warnings ++drop table if exists t1; ++--enable_warnings ++create table t1(n int) engine = InnoDB; ++ ++let $i=300; ++while ($i) ++{ ++ eval insert into t1 values ($i); ++ dec $i; ++} ++ ++save_master_pos; ++ ++connection slave; ++sync_with_master; ++ ++echo "The following are SLAVE."; ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++show master status; ++ ++connection slave_sec; ++echo "The following are SLAVE1."; ++start slave; ++sync_with_master; ++ ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++ ++# make the slave the new master ++connection slave; ++echo "The following are SLAVE."; ++ ++# The first 1201 error is caused by running slave. 
++--error 1201 ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication-log'; ++stop slave; ++ ++# The second 1201 error is caused by failover mode. ++--error 1201 ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication_log'; ++ ++MAKE MASTER REVOKE SESSION WITH KILL; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication_log' ++ WITH BINLOG; ++ ++MAKE MASTER GRANT SESSION; ++ ++delete from t1 where n > 250; ++save_master_pos; ++ ++select count(distinct n) from t1; ++ ++connection slave_sec; ++echo "The following are SLAVE1."; ++ ++sync_with_master; ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++ ++connection slave_ter; ++echo "The following are SLAVE2."; ++start slave; ++sync_with_master; ++ ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++ ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++ ++connection master; ++drop table t1; ++ ++connection slave; ++drop table t1; ++save_master_pos; ++ ++connection slave_sec; ++sync_with_master; ++ ++connection slave; ++echo "The following are SLAVE."; ++ ++show master logs; ++show master status; ++ ++ ++connection slave_ter; ++echo "The following are SLAVE2."; ++sync_with_master; ++ ++show master logs; ++show master status; ++purge master logs to 'master-bin.000006'; ++show master logs; ++--error 1186 ++reset master; +diff -r 66cc9e0a6768 patch_info/mirror_binlog.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/mirror_binlog.info Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,6 @@ ++File=mirror_binlog.patch ++Name=Mirroring binary logs on slave ++Version=V1 ++Author=Google ++License=GPL ++Comment=contains FastMaster promotion patch +diff -r 66cc9e0a6768 sql/Makefile.am +--- a/sql/Makefile.am Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/Makefile.am Thu Dec 04 21:46:15 2008 -0800 +@@ -68,7 +68,7 @@ + sql_array.h sql_cursor.h 
\ + examples/ha_example.h ha_archive.h \ + examples/ha_tina.h ha_blackhole.h \ +- ha_federated.h ++ ha_federated.h repl_mule.h + mysqld_SOURCES = sql_lex.cc sql_handler.cc \ + item.cc item_sum.cc item_buff.cc item_func.cc \ + item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \ +@@ -105,7 +105,7 @@ + sp_cache.cc parse_file.cc sql_trigger.cc \ + examples/ha_example.cc ha_archive.cc \ + examples/ha_tina.cc ha_blackhole.cc \ +- ha_federated.cc ++ ha_federated.cc repl_mule.cc + + gen_lex_hash_SOURCES = gen_lex_hash.cc + gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) +diff -r 66cc9e0a6768 sql/Makefile.in +--- a/sql/Makefile.in Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/Makefile.in Thu Dec 04 21:46:15 2008 -0800 +@@ -152,7 +152,7 @@ + sp_rcontext.$(OBJEXT) sp.$(OBJEXT) sp_cache.$(OBJEXT) \ + parse_file.$(OBJEXT) sql_trigger.$(OBJEXT) \ + ha_example.$(OBJEXT) ha_archive.$(OBJEXT) ha_tina.$(OBJEXT) \ +- ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) ++ ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) repl_mule.$(OBJEXT) + mysqld_OBJECTS = $(am_mysqld_OBJECTS) + mysqld_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) \ + $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) \ +@@ -516,7 +516,7 @@ + sql_array.h sql_cursor.h \ + examples/ha_example.h ha_archive.h \ + examples/ha_tina.h ha_blackhole.h \ +- ha_federated.h ++ ha_federated.h repl_mule.h + + mysqld_SOURCES = sql_lex.cc sql_handler.cc \ + item.cc item_sum.cc item_buff.cc item_func.cc \ +@@ -554,7 +554,7 @@ + sp_cache.cc parse_file.cc sql_trigger.cc \ + examples/ha_example.cc ha_archive.cc \ + examples/ha_tina.cc ha_blackhole.cc \ +- ha_federated.cc ++ ha_federated.cc repl_mule.cc + + gen_lex_hash_SOURCES = gen_lex_hash.cc + gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) +@@ -748,6 +748,7 @@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protocol.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/records.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_failsafe.Po@am__quote@ 
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_mule.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/set_var.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/slave.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sp.Po@am__quote@ +diff -r 66cc9e0a6768 sql/lex.h +--- a/sql/lex.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/lex.h Thu Dec 04 21:46:15 2008 -0800 +@@ -292,6 +292,7 @@ + { "LONGTEXT", SYM(LONGTEXT)}, + { "LOOP", SYM(LOOP_SYM)}, + { "LOW_PRIORITY", SYM(LOW_PRIORITY)}, ++ { "MAKE", SYM(MAKE_SYM)}, + { "MASTER", SYM(MASTER_SYM)}, + { "MASTER_CONNECT_RETRY", SYM(MASTER_CONNECT_RETRY_SYM)}, + { "MASTER_HOST", SYM(MASTER_HOST_SYM)}, +diff -r 66cc9e0a6768 sql/log.cc +--- a/sql/log.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/log.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -79,7 +79,9 @@ + + bool binlog_init() + { +- return !opt_bin_log; ++ if (!opt_bin_log) ++ binlog_hton.prepare = NULL; ++ return 0; /* return !opt_bin_log; */ + } + + static int binlog_close_connection(THD *thd) +@@ -406,6 +408,7 @@ + :bytes_written(0), last_time(0), query_start(0), name(0), + prepared_xids(0), log_type(LOG_CLOSED), file_id(1), open_count(1), + write_error(FALSE), inited(FALSE), need_start_event(TRUE), ++ mule_binlog_(0), + description_event_for_exec(0), description_event_for_queue(0) + { + /* +@@ -506,7 +509,10 @@ + const char *log_name) + { + File index_file_nr= -1; +- DBUG_ASSERT(!my_b_inited(&index_file)); ++ ++ /* If the index is already opened, do not open it again. 
*/ ++ if (my_b_inited(&index_file)) ++ return FALSE; + + /* + First open of this class instance +@@ -750,7 +756,7 @@ + if (file >= 0) + my_close(file,MYF(0)); + end_io_cache(&log_file); +- end_io_cache(&index_file); ++ close_index_file(); + safeFree(name); + log_type= LOG_CLOSED; + DBUG_RETURN(1); +@@ -768,7 +774,10 @@ + int MYSQL_LOG::raw_get_current_log(LOG_INFO* linfo) + { + strmake(linfo->log_file_name, log_file_name, sizeof(linfo->log_file_name)-1); +- linfo->pos = my_b_tell(&log_file); ++ if (!mule_binlog_) ++ linfo->pos = my_b_tell(&log_file); ++ else ++ linfo->pos = my_b_filelength(&log_file); + return 0; + } + +@@ -935,6 +944,11 @@ + if (need_lock) + pthread_mutex_lock(&LOCK_index); + safe_mutex_assert_owner(&LOCK_index); ++ ++ if (open_index_file(index_file_name, NULL) != 0) { ++ error = -1; ++ goto err; ++ } + + /* As the file is flushed, we can't get an error here */ + (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0, +@@ -1446,18 +1460,19 @@ + SYNOPSIS + new_file() + need_lock Set to 1 if caller has not locked LOCK_log ++ logfile_name the specified log filename. + + NOTE + The new file name is stored last in the index file + */ + +-void MYSQL_LOG::new_file(bool need_lock) ++void MYSQL_LOG::new_file(bool need_lock, const char* log_filename) + { + char new_name[FN_REFLEN], *new_name_ptr, *old_name; + enum_log_type save_log_type; + + DBUG_ENTER("MYSQL_LOG::new_file"); +- if (!is_open()) ++ if (!is_log_open()) + { + DBUG_PRINT("info",("log is closed")); + DBUG_VOID_RETURN; +@@ -1496,7 +1511,9 @@ + We have to do this here and not in open as we want to store the + new file name in the current binary log file. 
+ */ +- if (generate_new_name(new_name, name)) ++ if (log_filename) { ++ fn_format(new_name,log_filename,mysql_data_home,"",4); ++ } else if (generate_new_name(new_name, name)) + goto end; + new_name_ptr=new_name; + +@@ -1571,7 +1588,7 @@ + bytes_written+= ev->data_written; + DBUG_PRINT("info",("max_size: %lu",max_size)); + if ((uint) my_b_append_tell(&log_file) > max_size) +- new_file(0); ++ new_file(0); + + err: + pthread_mutex_unlock(&LOCK_log); +@@ -1600,8 +1617,14 @@ + bytes_written += len; + } while ((buf=va_arg(args,const char*)) && (len=va_arg(args,uint))); + DBUG_PRINT("info",("max_size: %lu",max_size)); +- if ((uint) my_b_append_tell(&log_file) > max_size) +- new_file(0); ++ ++ /* If max_size is BINLOG_NOSWITCH_SIZE, binlog would not switch because ++ * of file size limit. ++ */ ++ if (max_size != BINLOG_NOSWITCH_SIZE && ++ (uint) my_b_append_tell(&log_file) > max_size) { ++ new_file(0); ++ } + + err: + if (!error) +@@ -2492,6 +2515,17 @@ + DBUG_VOID_RETURN; + } + ++int MYSQL_LOG::flush_log_file() { ++ return flush_io_cache(&log_file); ++} ++ ++int MYSQL_LOG::close_index_file() { ++ if (my_b_inited(&index_file)) { ++ end_io_cache(&index_file); ++ my_close(index_file.file, MYF(0)); ++ } ++ return 0; ++} + + /* + Check if a string is a valid number +diff -r 66cc9e0a6768 sql/log_event.h +--- a/sql/log_event.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/log_event.h Thu Dec 04 21:46:15 2008 -0800 +@@ -94,6 +94,14 @@ + #define LINE_TERM_EMPTY 0x4 + #define LINE_START_EMPTY 0x8 + #define ESCAPED_EMPTY 0x10 ++ ++/* This server-id value is used to indicate a special master-info event ++ * in relay-log. ++ * We will enforce in database that replication can not set this value ++ * as the server-id. 
++ */ ++#define MASTER_INFO_SERVER_ID 0xffffffff ++ + + /***************************************************************************** + +diff -r 66cc9e0a6768 sql/mysql_priv.h +--- a/sql/mysql_priv.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/mysql_priv.h Thu Dec 04 21:46:15 2008 -0800 +@@ -462,6 +462,7 @@ + /* BINLOG_DUMP options */ + + #define BINLOG_DUMP_NON_BLOCK 1 ++#define BINLOG_MIRROR_CLIENT 0x0004 + + /* sql_show.cc:show_log_files() */ + #define SHOW_LOG_STATUS_FREE "FREE" +@@ -1374,6 +1375,7 @@ + extern const char **errmesg; /* Error messages */ + extern const char *myisam_recover_options_str; + extern const char *in_left_expr_name, *in_additional_cond, *in_having_cond; ++extern char *opt_binlog_index_name; + extern const char * const triggers_file_ext; + extern const char * const trigname_file_ext; + extern Eq_creator eq_creator; +@@ -1875,6 +1877,10 @@ + extern "C" void unireg_abort(int exit_code); + void kill_delayed_threads(void); + bool check_stack_overrun(THD *thd, long margin, char *dummy); ++extern my_bool rpl_mirror_binlog_enabled; ++extern ulong sync_mirror_binlog_period; ++extern my_bool rpl_mirror_binlog_no_replicate; ++extern ulong rpl_mirror_binlog_clients, rpl_mirror_binlog_status; + #else + #define unireg_abort(exit_code) DBUG_RETURN(exit_code) + inline void kill_delayed_threads(void) {} +diff -r 66cc9e0a6768 sql/mysqld.cc +--- a/sql/mysqld.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/mysqld.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -555,6 +555,7 @@ + pthread_mutex_t LOCK_global_user_client_stats; + pthread_mutex_t LOCK_global_table_stats; + pthread_mutex_t LOCK_global_index_stats; ++pthread_mutex_t LOCK_failover_master; + /* + The below lock protects access to two global server variables: + max_prepared_stmt_count and prepared_stmt_count. 
These variables +@@ -584,13 +585,15 @@ + char *master_ssl_key, *master_ssl_cert; + char *master_ssl_ca, *master_ssl_capath, *master_ssl_cipher; + ++char *opt_binlog_index_name; ++ + /* Static variables */ + + static bool kill_in_progress, segfaulted; + static my_bool opt_do_pstack, opt_bootstrap, opt_myisam_log; + static int cleanup_done; + static ulong opt_specialflag, opt_myisam_block_size; +-static char *opt_logname, *opt_update_logname, *opt_binlog_index_name; ++static char *opt_logname, *opt_update_logname; + static char *opt_tc_heuristic_recover; + static char *mysql_home_ptr, *pidfile_name_ptr; + static char **defaults_argv; +@@ -598,6 +601,32 @@ + + static my_socket unix_sock,ip_sock; + struct rand_struct sql_rand; // used by sql_class.cc:THD::THD() ++ ++/* When set, we are inside a failover slave and deny all non-super access */ ++bool failover_deny_access= 0; ++ ++/* When set, binlog will be mirrored on the replica. */ ++my_bool rpl_mirror_binlog_enabled; ++ ++/* Sync the mirrored binlog to disk after every #th event. */ ++ulong sync_mirror_binlog_period; ++ ++/* The fixed size for replication event buffer. Replication event can exceed ++ * the size. ++ */ ++//ulong rpl_event_buffer_size; ++ ++/* This is a mirror binlog status variable on the primary to indicate how many ++ * mirror binlog servers are connecting. ++ */ ++ulong rpl_mirror_binlog_clients = 0; ++ ++/* This indicates whether mirror binlog is working on a replica database. It ++ * requires: ++ * . rpl_mirror_binlog_enabled = 1 ++ * . 
the slave I/O thread is running and mirror binlog is also dumped ++ */ ++ulong rpl_mirror_binlog_status = 0; + + /* OS specific variables */ + +@@ -1315,6 +1344,7 @@ + (void) pthread_cond_destroy(&COND_flush_thread_cache); + (void) pthread_cond_destroy(&COND_manager); + (void) pthread_mutex_destroy(&LOCK_stats); ++ (void) pthread_mutex_destroy(&LOCK_failover_master); + (void) pthread_mutex_destroy(&LOCK_global_user_client_stats); + (void) pthread_mutex_destroy(&LOCK_global_table_stats); + (void) pthread_mutex_destroy(&LOCK_global_index_stats); +@@ -3164,6 +3194,7 @@ + (void) pthread_cond_init(&COND_rpl_status, NULL); + #endif + (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST); ++ (void) pthread_mutex_init(&LOCK_failover_master, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST); +@@ -3398,39 +3429,8 @@ + + if (opt_bin_log) + { +- char buf[FN_REFLEN]; +- const char *ln; +- ln= mysql_bin_log.generate_name(opt_bin_logname, "-bin", 1, buf); +- if (!opt_bin_logname && !opt_binlog_index_name) +- { +- /* +- User didn't give us info to name the binlog index file. +- Picking `hostname`-bin.index like did in 4.x, causes replication to +- fail if the hostname is changed later. So, we would like to instead +- require a name. But as we don't want to break many existing setups, we +- only give warning, not error. +- */ +- sql_print_warning("No argument was provided to --log-bin, and " +- "--log-bin-index was not used; so replication " +- "may break when this MySQL server acts as a " +- "master and has his hostname changed!! 
Please " +- "use '--log-bin=%s' to avoid this problem.", ln); +- } +- if (ln == buf) +- { +- my_free(opt_bin_logname, MYF(MY_ALLOW_ZERO_PTR)); +- opt_bin_logname=my_strdup(buf, MYF(0)); +- } +- if (mysql_bin_log.open_index_file(opt_binlog_index_name, ln)) +- { +- unireg_abort(1); +- } +- +- /* +- Used to specify which type of lock we need to use for queries of type +- INSERT ... SELECT. This will change when we have row level logging. +- */ +- using_update_log=1; ++ if (make_master_open_index(&opt_bin_logname, opt_binlog_index_name) != 0) ++ unireg_abort(1); + } + + if (xid_cache_init()) +@@ -3480,9 +3480,10 @@ + unireg_abort(1); + } + +- if (opt_bin_log && mysql_bin_log.open(opt_bin_logname, LOG_BIN, 0, +- WRITE_CACHE, 0, max_binlog_size, 0)) +- unireg_abort(1); ++ if (opt_bin_log && ++ make_master(NULL, opt_bin_logname, opt_binlog_index_name, NULL) != 0) { ++ unireg_abort(1); ++ } + + #ifdef HAVE_REPLICATION + if (opt_bin_log && expire_logs_days) +@@ -5098,6 +5098,8 @@ + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, + OPT_INNODB_ADAPTIVE_HASH_INDEX, ++ OPT_RPL_MIRROR_BINLOG, ++ OPT_SYNC_MIRROR_BINLOG, + OPT_FEDERATED, + OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM + }; +@@ -5725,6 +5728,11 @@ + {"rpl-recovery-rank", OPT_RPL_RECOVERY_RANK, "Undocumented.", + (gptr*) &rpl_recovery_rank, (gptr*) &rpl_recovery_rank, 0, GET_ULONG, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, ++ {"rpl_mirror_binlog_enabled", OPT_RPL_MIRROR_BINLOG, ++ "1 = support mirroring binlogs. 
0 = disable mirroring binlogs", ++ (gptr*) &rpl_mirror_binlog_enabled, ++ (gptr*) &rpl_mirror_binlog_enabled, 0, GET_BOOL, NO_ARG, ++ 0, 0, 1, 0, 1, 0}, + {"safe-mode", OPT_SAFE, "Skip some optimize stages (for testing).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + #ifndef TO_BE_DELETED +@@ -5849,6 +5857,11 @@ + {"symbolic-links", 's', "Enable symbolic link support.", + (gptr*) &my_use_symdir, (gptr*) &my_use_symdir, 0, GET_BOOL, NO_ARG, + IF_PURIFY(0,1), 0, 0, 0, 0, 0}, ++ {"sync-mirror-binlog", OPT_SYNC_MIRROR_BINLOG, ++ "Sync the mirrored binlog to disk after every #th event. " ++ "#=0 (the default) does no sync. Syncing slows MySQL down", ++ (gptr*) &sync_mirror_binlog_period, ++ (gptr*) &sync_mirror_binlog_period, 0, GET_ULONG, REQUIRED_ARG, 0, 0, ~0L, 0, 1, 0}, + {"sysdate-is-now", OPT_SYSDATE_IS_NOW, + "Non-default option to alias SYSDATE() to NOW() to make it safe-replicable. Since 5.0, SYSDATE() returns a `dynamic' value different for different invocations, even within the same statement.", + (gptr*) &global_system_variables.sysdate_is_now, +@@ -6625,6 +6638,7 @@ + {"Delayed_errors", (char*) &delayed_insert_errors, SHOW_LONG}, + {"Delayed_insert_threads", (char*) &delayed_insert_threads, SHOW_LONG_CONST}, + {"Delayed_writes", (char*) &delayed_insert_writes, SHOW_LONG}, ++ {"Failover_deny_access", (char*) &failover_deny_access, SHOW_LONG}, + {"Flush_commands", (char*) &refresh_version, SHOW_LONG_CONST}, + {"Handler_commit", (char*) offsetof(STATUS_VAR, ha_commit_count), SHOW_LONG_STATUS}, + {"Handler_delete", (char*) offsetof(STATUS_VAR, ha_delete_count), SHOW_LONG_STATUS}, +diff -r 66cc9e0a6768 sql/repl_mule.cc +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/sql/repl_mule.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,466 @@ ++/* ++ Copyright (C) 2007 Google Inc. 
++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public License ++as published by the Free Software Foundation; either version 2 ++of the License, or (at your option) any later version. ++ ++This program is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with this program; if not, write to the Free Software ++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++*/ ++ ++#include "mysql_priv.h" ++#include <my_dir.h> ++#include "slave.h" ++#include "repl_mule.h" ++ ++/* max log size: 2GB */ ++#define MAX_LOG_SIZE BINLOG_NOSWITCH_SIZE ++ ++ReplMule::ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status, ++ my_off_t file_size, const char *binlog_indexname, ++ MYSQL_LOG *binlog, ulong sync_period) ++ : desc_event_(new Format_description_log_event(BINLOG_VERSION)), ++ io_thd_(thd), mi_(mi), status_(status), dump_position_(0L), ++ file_size_(file_size), mule_log_(binlog), ++ mule_log_sync_period_(sync_period), mule_log_event_counter_(0) { ++ char llbuf1[22], llbuf2[22]; ++ ++ DBUG_ENTER("ReplMule::ReplMule"); ++ ++ /* Indicate that we are in replication mule mode. */ ++ mule_log_->set_mule_mode(); ++ ++ strmake(curr_log_filename_, mi->master_log_name, ++ sizeof(curr_log_filename_)-1); ++ strmake(mule_indexname_, binlog_indexname, sizeof(mule_indexname_)-1); ++ ++ /* Open the mule log file */ ++ if (!mule_log_->is_log_open()) { ++ /* Do not open binlog file when master_log_name is not specified. We ++ * are at the I/O thread initialization time and we do not know what ++ * filename we are going to dump. ++ * We wait for the next rotation event to indicate the filename. 
++ */ ++ if (strlen(curr_log_filename_) > 0 && ++ mule_log_->open(curr_log_filename_, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) { ++ sql_print_error("ReplMule: open binlog failed: %s", ++ curr_log_filename_); ++ status_ = MULE_ERROR; ++ DBUG_VOID_RETURN; ++ } ++ } ++ ++ switch (status_) { ++ case MULE_BEHIND: ++ dump_position_ = mi->master_log_pos; ++ mi->master_log_pos = file_size_; ++ sql_print_information("ReplicationMule: MULE_BEHIND - new(%s), old(%s)", ++ llstr(mi->master_log_pos, llbuf1), ++ llstr(dump_position_, llbuf2)); ++ break; ++ case RELAY_MATCH_MULE: ++ case RELAY_MATCH_MULE_RUN: ++ dump_position_ = mi->master_log_pos; ++ sql_print_information("ReplicationMule: RELAY_MATCH_MULE."); ++ break; ++ case MULE_VERIFY: ++ case MULE_VERIFY_RELAY_BEHIND: ++ dump_position_ = mi->master_log_pos; ++ mi->master_log_pos = BIN_LOG_HEADER_SIZE; ++ sql_print_information( ++ "ReplicationMule: MULE_VERIFY - old(%s), file_size(%s)", ++ llstr(dump_position_, llbuf1), llstr(file_size_, llbuf2)); ++ ++ /* seek to the beginning of the file for verification */ ++ seekToPosition(BIN_LOG_HEADER_SIZE); ++ break; ++ } ++ ++ DBUG_VOID_RETURN; ++} ++ ++ReplMule::~ReplMule() { ++ DBUG_ENTER("ReplMule::~ReplMule"); ++ ++ if (mule_log_->is_log_open()) ++ mule_log_->close(LOG_CLOSE_INDEX); ++ mule_log_->clear_mule_mode(); ++ ++ /* If we are still in MULE_BEHIND or MULE_VERIFY state and we exit from ++ * I/O thread, it means we encountered some errors. ++ * mi->master_log_pos might be used by later slave start. It is being ++ * changed here to do event dumping or event verification. So, we should ++ * restore it to its original value. 
++ */ ++ switch (status_) { ++ case MULE_BEHIND: ++ case MULE_VERIFY: ++ if (mi_->master_log_pos < dump_position_) ++ mi_->master_log_pos = dump_position_; ++ break; ++ } ++ ++ delete desc_event_; ++ ++ DBUG_VOID_RETURN; ++} ++ ++ReplMule::WriteStatus ReplMule::writeEvent(const char* buf, ulong event_len) { ++ WriteStatus dump_status = WRITE_RELAY; ++ char llbuf1[22], llbuf2[22], llbuf3[22]; ++ char *verify_event; ++ bool verified = false; ++ bool skip_event = false; ++ ++ DBUG_ENTER("ReplMule::dumpEvent"); ++ switch (status_) { ++ case MULE_VERIFY: ++ case MULE_VERIFY_RELAY_BEHIND: ++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT && ++ IsFakeRotation(buf, event_len)) { ++ /* Do not verify the faked rotate event */ ++ if (status_ == MULE_VERIFY) ++ dump_status = SKIP_RELAY; ++ break; ++ } ++ verify_event = new char[event_len]; ++ if (verify_event == NULL) { ++ sql_print_error( ++ "ReplMule::dumpEvent - insufficient memory in verification, " ++ "position(%s), event_len(%d).", ++ llstr(mi_->master_log_pos, llbuf1), event_len); ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ if (my_b_read(mule_log_->get_log_file(), (byte*) verify_event, ++ event_len) != 0) { ++ sql_print_error( ++ "ReplMule::dumpEvent - read log error in verification, " ++ "position(%s), event_len(%d).", ++ llstr(mi_->master_log_pos, llbuf1), event_len); ++ dump_status = WRITE_ERROR; ++ delete verify_event; ++ break; ++ } ++ verified = (memcmp(buf, verify_event, event_len) == 0); ++ delete verify_event; ++ if (!verified) { ++ sql_print_error( ++ "ReplMule::dumpEvent - event does not match at position(%s)", ++ llstr(mi_->master_log_pos, llbuf1)); ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ /* fall through */ ++ case MULE_BEHIND: ++ dump_status = SKIP_RELAY; ++ if (status_ == MULE_BEHIND && ++ queueEvent(buf, event_len, &skip_event) != 0) { ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ ++ /* Skip faked rotation event */ ++ if (!skip_event) ++ mi_->master_log_pos += event_len; ++ ++ if 
(mi_->master_log_pos == dump_position_) { ++ if (dump_position_ < file_size_) { ++ status_ = MULE_VERIFY_RELAY_BEHIND; ++ } else { ++ status_ = RELAY_MATCH_MULE; ++ } ++ sql_print_information( ++ "ReplMule::dumpEvent - new status(%d) " ++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_, ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2), ++ llstr(file_size_, llbuf3)); ++ } else if (mi_->master_log_pos == file_size_) { ++ if (dump_position_ > file_size_) { ++ status_ = MULE_BEHIND; ++ } else { ++ status_ = RELAY_MATCH_MULE; ++ } ++ sql_print_information( ++ "ReplMule::dumpEvent - new status(%d) " ++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_, ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2), ++ llstr(file_size_, llbuf3)); ++ } else if (status_ != MULE_VERIFY_RELAY_BEHIND && ++ mi_->master_log_pos > dump_position_) { ++ sql_print_error( ++ "ReplMule::dumpEvent - mule position(%s) does not match " ++ "relay-log position(%s).", ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2)); ++ dump_status = WRITE_ERROR; ++ } ++ break; ++ case RELAY_MATCH_MULE_RUN: ++ if (buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) { ++ sql_print_information(" RELAY_MATCH_MULE event %d", buf[EVENT_TYPE_OFFSET] ); ++ /* Do not write format description record if size is the same */ ++ break; ++ } ++ case RELAY_MATCH_MULE: ++ if (queueEvent(buf, event_len, &skip_event) != 0) ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ ++ DBUG_RETURN(dump_status); ++} ++ ++int ReplMule::appendEvent(const char* buf, ulong event_len) { ++ char llbuf1[22]; ++ int error; ++ ++ DBUG_ENTER("ReplMule::appendEvent"); ++ ++ error = mule_log_->appendv(buf,event_len,0); ++ if (error != 0) { ++ sql_print_error("ReplMule::appendEvent - append error at %s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ } else if (mule_log_->flush_log_file() != 0) { ++ sql_print_error("ReplMule::appendEvent - flush error at 
%s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ error = -1; ++ } else if (mule_log_sync_period_ > 0) { ++ mule_log_event_counter_++; ++ if (mule_log_event_counter_ >= mule_log_sync_period_) { ++ mule_log_event_counter_ = 0; ++ error = my_sync(mule_log_->get_log_file()->file, MYF(MY_WME)); ++ if (error != 0) ++ sql_print_error("ReplMule::appendEvent - sync error at %s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ } ++ } ++ ++ DBUG_RETURN(error); ++} ++ ++int ReplMule::queueEvent(const char* buf, ulong event_len, bool *skip_event) { ++ int error = 0; ++ ++ DBUG_ENTER("ReplMule::queueEvent"); ++ ++ *skip_event = false; ++ ++ mule_log_->lock_log(); ++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT) { ++ Rotate_log_event rev(buf, event_len, desc_event_); ++ ++ /* If this is a faked rotate event and the specified filename is ++ * the same as the current binlog filename, ignore the event. ++ */ ++ if (IsFakeRotation(rev)) { ++ *skip_event = true; ++ DBUG_PRINT("info",("skipped faked rotation event")); ++ } else { ++ /* Only append real events. */ ++ if (rev.when != 0) ++ error = appendEvent(buf, event_len); ++ ++ /* Only rotate file when append succeeds. */ ++ if (error == 0) { ++ /* Create a new file: lock both index and log. */ ++ if (strlen(curr_log_filename_) == 0) { ++ /* If curr_log_filename_ is not specified, then this is the first ++ * valid rotation event to indicate the filename. 
++ */ ++ error = mule_log_->open(rev.new_log_ident, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0); ++ } else { ++ mule_log_->new_file(0, rev.new_log_ident); ++ } ++ ++ strmake(curr_log_filename_, rev.new_log_ident, ++ strlen(rev.new_log_ident)); ++ ++ DBUG_PRINT("info",("rotate file: %s", rev.new_log_ident)); ++ } ++ } ++ } else { ++ error = appendEvent(buf, event_len); ++ } ++ mule_log_->unlock_log(); ++ ++ DBUG_RETURN(error); ++} ++ ++void ReplMule::seekToPosition(my_off_t pos) { ++ DBUG_ENTER("ReplMule::seekToPosition"); ++ DBUG_PRINT("enter",("seek_pos: %ld", (ulong) pos)); ++ ++ my_b_seek(mule_log_->get_log_file(), pos); ++ DBUG_VOID_RETURN; ++} ++ ++bool ReplMule::IsFakeRotation(const char* buf, ulong event_len) { ++ DBUG_ENTER("ReplMule::IsFakeRotation"); ++ ++ Rotate_log_event rev(buf, event_len, desc_event_); ++ DBUG_RETURN(IsFakeRotation(rev)); ++} ++ ++bool ReplMule::IsFakeRotation(const Rotate_log_event& rev) { ++ DBUG_ENTER("ReplMule::IsFakeRotation"); ++ DBUG_RETURN(rev.when == 0 && ++ rev.ident_len == strlen(curr_log_filename_) && ++ strcmp(rev.new_log_ident, curr_log_filename_) == 0); ++} ++ ++/* createReplicationMule: ++ * Create a mule that relays master's replication binlog and ++ * generate an exact same copy on the local filesystem. 
++ * ++ * Code flow: ++ * last_mulelog = scan the existing mule log index to find it ++ * if (mulelog index is not created or there is no mule log inside it) ++ * old_mule_log <- requested dumping position ++ * requested dumping position <- 0 in the file ++ * else ++ * check whether the mule log matches the requested dump ++ * (whether the last mule log name/size matches) ++ * if the mule log name does not match ++ * exit with an error ++ * if (the mule log size does not match the requested dump position) ++ * request the dump from position 0 and read all events ++ * verify all events with the corresponding events in mule log ++ * if (the verification succeeds) ++ * continue the dump ++ * else ++ * exit with an error ++ */ ++ReplMule* ReplMule::createReplicationMule( ++ THD* thd, MASTER_INFO *mi, const char *binlog_indexname, ++ MYSQL_LOG *binlog) { ++ ReplMule *mule = NULL; ++ LOG_INFO linfo; ++ bool index_opened = false; ++ ++ DBUG_ENTER("ReplMule::createReplicationMule"); ++ ++ /* binlog_indexname must be set to some real value. */ ++ DBUG_ASSERT(binlog_indexname); ++ ++ /* Lock binlog index for all binlog operations */ ++ binlog->lock_index(); ++ index_opened = binlog->open_index_file(binlog_indexname, NULL); ++ DBUG_PRINT("info",("open index file succeed: %d", index_opened)); ++ sql_print_information("createReplicationMule"); ++ ++ /* Scan the existing binlog index to find the last relayed binlog */ ++ if (index_opened || ++ binlog->find_log_pos(&linfo, NullS, false) != 0) { ++ /* binlog index is not created or has no log file inside: ++ * . old_relay_binlog <- requested dumping position ++ * . 
requested dumping position <- 0 in the file ++ */ ++ if (mi->master_log_pos == BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, sync_mirror_binlog_period); ++ } else { ++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, sync_mirror_binlog_period); ++ } ++ ++ if (mule == NULL) { ++ sql_print_error("Mule malloc operation failed."); ++ } ++ } else { ++ IO_CACHE* log_file; ++ MY_STAT stat; ++ char last_binlog_name[FN_REFLEN]; ++ ++ /* Find the last log file from the binlog index. ++ * Check whether the last binlog matches the requested dump for both ++ * binlog name and binlog size. ++ */ ++ for (;;) { ++ strmake(last_binlog_name, linfo.log_file_name, FN_REFLEN); ++ last_binlog_name[FN_REFLEN - 1] = '\0'; ++ if (binlog->find_next_log(&linfo, false)) ++ break; ++ } ++ DBUG_PRINT("info",("the last binlog: %s", last_binlog_name)); ++ ++ /* if the binlog name does not match, exit with an error. */ ++ if (strcmp(last_binlog_name+dirname_length(last_binlog_name), ++ mi->master_log_name) != 0) { ++ sql_print_error("Mule binlog(%s) does not match new relay-binlog(%s)", ++ last_binlog_name, mi->master_log_name); ++ } /* Open the last binlog. */ ++ else if (binlog->open(last_binlog_name, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) { ++ sql_print_error("Mule open last binlog failed: %s", last_binlog_name); ++ } else { ++ bool valid_file_size = true; ++ ++ /* Get the binlog size. */ ++ log_file = binlog->get_log_file(); ++ if (my_fstat(log_file->file, &stat, MYF(0)) == 0) { ++ /* If the binlog size does not match the requested dump position, then ++ * request the dump from position 0 and verify all events, we need to ++ * verify events because the mule log might be used for serving during ++ * anytime. We must be sure that they are correct. 
++ */ ++ sql_print_information("Binglog size %d", stat.st_size); ++ if (stat.st_size == mi->master_log_pos) { ++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE_RUN, stat.st_size, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else if (stat.st_size > BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, MULE_VERIFY, stat.st_size, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else if (stat.st_size == BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else { ++ char llbuf[22]; ++ valid_file_size = false; ++ sql_print_error("Mule binlog file(%s) invalid size: %s", ++ last_binlog_name, llstr(stat.st_size, llbuf)); ++ } ++ } else { ++ valid_file_size = false; ++ sql_print_error("Mule binlog file(%s): fstat failed.", ++ last_binlog_name); ++ } ++ ++ if (valid_file_size) { ++ if (mule == NULL) { ++ sql_print_error("Mule malloc operation failed."); ++ } else if (mule->status_ == MULE_ERROR) { ++ /* If mule creation fails, indicate the error. */ ++ delete mule; ++ mule = NULL; ++ } ++ } ++ } ++ } ++ ++ /* Clear the mule binlog mode if there are errors. */ ++ if (mule == NULL) { ++ binlog->clear_mule_mode(); ++ binlog->close_index_file(); ++ } ++ ++ /* Unlock binlog index */ ++ binlog->unlock_index(); ++ ++ DBUG_RETURN(mule); ++} +diff -r 66cc9e0a6768 sql/repl_mule.h +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/sql/repl_mule.h Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,166 @@ ++/* ++ Copyright (C) 2007 Google Inc. ++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public License ++as published by the Free Software Foundation; either version 2 ++of the License, or (at your option) any later version. 
++ ++This program is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with this program; if not, write to the Free Software ++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++*/ ++ ++#ifndef SQL_REPL_MULE_H__ ++#define SQL_REPL_MULE_H__ ++ ++/* Replication Mule is the class that is responsible for generating ++ * an exact copy of the binlog from a master database. We call this feature ++ * mirror binlog and it can be enabled by setting rpl_mirror_binlog. We ++ * need to keep the same copy for the following purposes: ++ * . The replica can serve the binlog transparently as if they are the ++ * master database. This can relieve master connection overhead. ++ * . During failover, the replica can become the new master and serve ++ * old binlogs transparently. ++ * (The Mule name comes from the popular P2P software eMule.) ++ * ++ * Internally, we call the mirrored binlog mule log. ++ */ ++ ++class THD; ++class Rotate_log_event; ++class Format_description_log_event; ++typedef struct st_master_info MASTER_INFO; ++ ++class ReplMule { ++ public: ++ /* Because I/O thread also creates relay-binlog, instead of an exact ++ * copy of the original master's binlog, we have two resources that ++ * might get out of sync. 
++ * This enum indicates the status: ++ * MULE_BEHIND - the mule's header is behind: ++ * (mule is activated for the first time) ++ * RELAY_MATCH_MULE - mule matches relay-log ++ * RELAY_MATCH_MULE_RUN - mule matches relay-log and it was not empty binlog ++ * MULE_VERIFY - mule has more events than the relay-log and needs ++ * verification; we can not verify based on relay-log ++ * events because events might get changed a little; ++ * verification starts with downloading all events in ++ * the last binlog from the master and compare with ++ * all events in the mule log; ++ * MULE_VERIFY_RELAY_BEHIND - mule has more events than the relay-log ++ * and relay-log needs to write events ++ * MULE_ERROR - mule detects errors in event duplicate ++ * ++ * When the mule mirrors binlogs, it writes an event into the mule log ++ * first. Then, I/O thread writes the event into the relay log. ++ */ ++ enum RelayStatus { ++ MULE_BEHIND = 1, ++ RELAY_MATCH_MULE = 2, ++ RELAY_MATCH_MULE_RUN = 7, ++ MULE_VERIFY = 3, ++ MULE_VERIFY_RELAY_BEHIND = 4, ++ MULE_ERROR = 5, ++ }; ++ ++ enum WriteStatus { ++ WRITE_RELAY = 1, ++ WRITE_ERROR = 2, ++ SKIP_RELAY = 3, ++ }; ++ ++ private: ++ const Format_description_log_event *desc_event_; ++ THD *io_thd_; ++ MASTER_INFO *mi_; ++ ++ /* ++ * I/O thread will write both mule log for mirror binlog and relay log ++ * for SQL thread. ++ * The variable indicates whether the two are in sync. ++ */ ++ RelayStatus status_; ++ ++ /* The starting event writing position. */ ++ my_off_t dump_position_; ++ ++ /* During the initial setup, the last mule log's file size. */ ++ my_off_t file_size_; ++ ++ /* Internally, we call the mirrored binlog mule log. */ ++ MYSQL_LOG *mule_log_; ++ ++ /* Sync the mule log to disk for every #N events. 
*/ ++ ulong mule_log_sync_period_; ++ ulong mule_log_event_counter_; ++ ++ /* mule log's index filename */ ++ char mule_indexname_[FN_REFLEN]; ++ ++ /* the current mule log's filename */ ++ char curr_log_filename_[FN_REFLEN]; ++ ++ ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status, ++ my_off_t file_size, const char *binlog_indexname, ++ MYSQL_LOG *binlog, ulong sync_period); ++ ++ /* ++ * Queue the event into the current mule log. If it is a rotation ++ * event, generate a new mule log file. ++ * Indicate whether the event is skipped because it is an fake event. ++ * A fake event is generated by the master to indicate the current ++ * reading position. ++ */ ++ int queueEvent(const char* buf, ulong event_len, bool *skip_event); ++ ++ /* Append the event to the current mule log. */ ++ int appendEvent(const char* buf, ulong event_len); ++ ++ bool IsFakeRotation(const char* buf, ulong event_len); ++ bool IsFakeRotation(const Rotate_log_event& rev); ++ ++ /* Seek to the specified position in the current open mule log. */ ++ void seekToPosition(my_off_t pos); ++ ++ public: ++ ++ ~ReplMule(); ++ ++ /* Dump the event into mule binlog. ++ * Input: ++ * buf (IN) - replication event buffer ++ * event_len (IN) - the event length ++ * ++ * Return: ++ * . WRITE_RELAY: the relay log needs to writing the event ++ * . WRITE_ERROR: the writing encountered errors ++ * . SKIP_RELAY: the relay log should skip the event ++ */ ++ WriteStatus writeEvent(const char* buf, ulong event_len); ++ ++ /* createReplicationMule: ++ * Create a mule that relays master's replication binlog and ++ * generate an exact same copy on the local filesystem. ++ * ++ * Input: ++ * thd (IN) - replication I/O thread ++ * mi (IN) - master info struct for I/O thread's progress ++ * binlog_indexname (IN) - filename for binlog's index ++ * binlog (IN) - replication binlog ++ * ++ * Return: ++ * . a replication mule if success ++ * . 
NULL if there are any errors ++ */ ++ static ReplMule *createReplicationMule(THD* thd, MASTER_INFO *mi, ++ const char *binlog_indexname, ++ MYSQL_LOG *binlog); ++}; ++ ++#endif /* SQL_REPL_MULE_H__ */ +diff -r 66cc9e0a6768 sql/set_var.cc +--- a/sql/set_var.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/set_var.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -345,6 +345,8 @@ + slog_verb); + sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank", + &rpl_recovery_rank); ++sys_var_bool_ptr sys_rpl_mirror_binlog_enabled("rpl_mirror_binlog_enabled", ++ &rpl_mirror_binlog_enabled); + sys_var_long_ptr sys_query_cache_size("query_cache_size", + &query_cache_size, + fix_query_cache_size); +@@ -364,6 +366,9 @@ + sys_var_thd_ulong sys_trans_prealloc_size("transaction_prealloc_size", + &SV::trans_prealloc_size, + 0, fix_trans_mem_root); ++sys_var_long_ptr sys_sync_mirror_binlog_period( ++ "sync_mirror_binlog_period", ++ &sync_mirror_binlog_period); + + #ifdef HAVE_QUERY_CACHE + sys_var_long_ptr sys_query_cache_limit("query_cache_limit", +@@ -774,6 +779,7 @@ + &sys_relay_log_purge, + #endif + &sys_rpl_recovery_rank, ++ &sys_rpl_mirror_binlog_enabled, + &sys_safe_updates, + &sys_secure_auth, + &sys_secure_file_priv, +@@ -1113,6 +1119,8 @@ + {"relay_log_space_limit", (char*) &relay_log_space_limit, SHOW_LONGLONG}, + #endif + {sys_rpl_recovery_rank.name,(char*) &sys_rpl_recovery_rank, SHOW_SYS}, ++ {sys_rpl_mirror_binlog_enabled.name, ++ (char *) &sys_rpl_mirror_binlog_enabled, SHOW_SYS}, + {"secure_auth", (char*) &sys_secure_auth, SHOW_SYS}, + {"secure_file_priv", (char*) &sys_secure_file_priv, SHOW_SYS}, + #ifdef HAVE_SMEM +diff -r 66cc9e0a6768 sql/slave.cc +--- a/sql/slave.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/slave.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -25,6 +25,7 @@ + #include <thr_alarm.h> + #include <my_dir.h> + #include <sql_common.h> ++#include "repl_mule.h" + #include <errmsg.h> + #include <mysys_err.h> + +@@ -3527,6 +3528,7 @@ + RELAY_LOG_INFO *rli= &mi->rli; + char 
llbuff[22]; + uint retry_count; ++ ReplMule *mule = NULL; + + // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff + my_thread_init(); +@@ -3609,6 +3611,23 @@ + if (get_master_version_and_clock(mysql, mi)) + goto err; + ++ if (rpl_mirror_binlog_enabled && !mule) { ++ if (opt_binlog_index_name == NULL) { ++ sql_print_error("\"log-bin-index\" must be set in mirror binlog."); ++ goto err; ++ } ++ ++ /* Create the mule to generate the exact copy of the binlog */ ++ mule = ReplMule::createReplicationMule( ++ thd, mi, opt_binlog_index_name, &mysql_bin_log); ++ ++ /* If we could not create the mule, we stop the I/O thread and report ++ * an error. ++ */ ++ if (mule == NULL) ++ goto err; ++ } ++ + if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1) + { + /* +@@ -3624,6 +3643,7 @@ + DBUG_PRINT("info",("Starting reading binary log from master")); + while (!io_slave_killed(thd,mi)) + { ++ const char* event_buf; + bool suppress_warnings= 0; + thd_proc_info(thd, "Requesting binlog dump"); + if (request_dump(mysql, mi, &suppress_warnings)) +@@ -3754,10 +3774,25 @@ + goto connected; + } // if (event_len == packet_error) + ++ event_buf = (const char*)mysql->net.read_pos + 1; ++ ++ if (mule) { ++ ReplMule::WriteStatus d_status = ++ mule->writeEvent(event_buf, event_len); ++ switch (d_status) { ++ case ReplMule::WRITE_RELAY: ++ break; ++ case ReplMule::SKIP_RELAY: ++ /* Skip writing relay event; go back to read the next event */ ++ continue; ++ case ReplMule::WRITE_ERROR: ++ goto err; ++ } ++ } ++ + retry_count=0; // ok event, reset retry counter + thd_proc_info(thd, "Queueing master event to the relay log"); +- if (queue_event(mi,(const char*)mysql->net.read_pos + 1, +- event_len)) ++ if (queue_event(mi, event_buf, event_len)) + { + sql_print_error("Slave I/O thread could not queue event from master"); + goto err; +@@ -3847,6 +3882,7 @@ + change_rpl_status(RPL_ACTIVE_SLAVE,RPL_IDLE_SLAVE); + DBUG_ASSERT(thd->net.buff != 0); + 
net_end(&thd->net); // destructor will not free it, because net.vio is 0 ++ delete mule; + close_thread_tables(thd, 0); + pthread_mutex_lock(&LOCK_thread_count); + THD_CHECK_SENTRY(thd); +diff -r 66cc9e0a6768 sql/sql_class.h +--- a/sql/sql_class.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_class.h Thu Dec 04 21:46:15 2008 -0800 +@@ -152,6 +152,12 @@ + #define LOG_INFO_FATAL -7 + #define LOG_INFO_IN_USE -8 + ++/* If the maximum size is equal to this value, binlog would not rotate on ++ * size limit. ++ */ ++#define BINLOG_NOSWITCH_SIZE ((ulong) -1) ++ ++ + /* bitmap to SQL_LOG::close() */ + #define LOG_CLOSE_INDEX 1 + #define LOG_CLOSE_TO_BE_OPENED 2 +@@ -245,6 +251,9 @@ + bool no_auto_events; + friend class Log_event; + ++ /* mule replication mode */ ++ bool mule_binlog_; ++ + public: + /* + These describe the log's format. This is used only for relay logs. +@@ -317,7 +326,8 @@ + } + bool open_index_file(const char *index_file_name_arg, + const char *log_name); +- void new_file(bool need_lock); ++ int close_index_file(); ++ void new_file(bool need_lock= 1, const char* log_filename= NULL); + bool write(THD *thd, enum enum_server_command command, + const char *format, ...) ATTRIBUTE_FORMAT(printf, 4, 5); + bool write(THD *thd, const char *query, uint query_length, +@@ -357,7 +367,27 @@ + int get_current_log(LOG_INFO* linfo); + int raw_get_current_log(LOG_INFO* linfo); + uint next_file_id(); +- inline bool is_open() { return log_type != LOG_CLOSED; } ++ ++ /* Because mysql use is_open() to check whether replication is on, ++ * we will let the check fail during binlog mule mode. Mule replication ++ * and normal master replication can not be on at the same time. ++ * ++ * is_log_open(): the binlog file is open for either purpose ++ * ++ * is_open(): the binlog is open for master replication. 
++ * is_mule_open(): the binlog is open for mirror binlog or for ++ * replication mule; refer repl_mule.h for details ++ */ ++ bool is_log_open() { ++ return log_type != LOG_CLOSED; ++ } ++ bool is_open() { ++ return (!mule_binlog_) && is_log_open(); ++ } ++ bool is_mule_open() { ++ return (mule_binlog_) && is_log_open(); ++ } ++ + inline char* get_index_fname() { return index_file_name;} + inline char* get_log_fname() { return log_file_name; } + inline char* get_name() { return name; } +@@ -366,8 +396,18 @@ + + inline void lock_index() { pthread_mutex_lock(&LOCK_index);} + inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);} ++ inline void lock_log() { pthread_mutex_lock(&LOCK_log);} ++ inline void unlock_log() { pthread_mutex_unlock(&LOCK_log);} + inline IO_CACHE *get_index_file() { return &index_file;} + inline uint32 get_open_count() { return open_count; } ++ /* Look in file repl_mule.h for the definition of mule. */ ++ void set_mule_mode() { ++ mule_binlog_ = 1; ++ } ++ void clear_mule_mode() { ++ mule_binlog_ = 0; ++ } ++ int flush_log_file(); + }; + + /* +diff -r 66cc9e0a6768 sql/sql_lex.h +--- a/sql/sql_lex.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_lex.h Thu Dec 04 21:46:15 2008 -0800 +@@ -104,6 +104,7 @@ + // TODO(mcallaghan): update status_vars in mysqld to export these + SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS, + SQLCOM_SHOW_CLIENT_STATS, ++ SQLCOM_MAKE_MASTER, + /* This should be the last !!! 
*/ + SQLCOM_END + }; +@@ -171,6 +172,12 @@ + char *ssl_key, *ssl_cert, *ssl_ca, *ssl_capath, *ssl_cipher; + char *relay_log_name; + ulong relay_log_pos; ++ ++ /* the following fields are used for make master command */ ++ char *log_index_name; ++ bool in_failover; ++ bool kill_session; ++ bool with_old_binlog; + } LEX_MASTER_INFO; + + +diff -r 66cc9e0a6768 sql/sql_parse.cc +--- a/sql/sql_parse.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_parse.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -402,6 +402,15 @@ + passwd_len ? "yes": "no", + thd->main_security_ctx.master_access, + (thd->db ? thd->db : "*none*"))); ++ ++ /* If we are in failover mode, reject all non-super user connections. */ ++ if (is_in_failover() && ++ !(thd->main_security_ctx.master_access & SUPER_ACL)) { ++ net_send_error(thd, ER_SPECIFIC_ACCESS_DENIED_ERROR, ++ "super-user only during failover"); ++ DBUG_RETURN(-1); ++ } ++ + + if (check_count) + { +@@ -3470,6 +3479,22 @@ + else + res = load_master_data(thd); + break; ++ ++ case SQLCOM_MAKE_MASTER: ++ { ++ thd_proc_info(thd, "Making master"); ++ ++ if (check_global_access(thd, SUPER_ACL)) ++ goto error; ++ res = make_master(thd, NULL, NULL, &lex->mi); ++ if (res == 0) { ++ // TODO -- wei is this OK, setting it to NULL? 
++ thd_proc_info(thd, 0); ++ send_ok(thd); ++ } ++ break; ++ } ++ + #endif /* HAVE_REPLICATION */ + #ifdef HAVE_NDBCLUSTER_DB + case SQLCOM_SHOW_NDBCLUSTER_STATUS: +diff -r 66cc9e0a6768 sql/sql_repl.cc +--- a/sql/sql_repl.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_repl.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -20,11 +20,19 @@ + #include "log_event.h" + #include <my_dir.h> + ++extern pthread_mutex_t LOCK_failover_master; ++extern bool failover_deny_access; ++ + int max_binlog_dump_events = 0; // unlimited + my_bool opt_sporadic_binlog_dump_fail = 0; + #ifndef DBUG_OFF + static int binlog_dump_count = 0; + #endif ++ ++static int make_master_open_log(MYSQL_LOG *log, const char *opt_name, ++ bool no_auto_events, ulong max_size); ++static int set_in_failover(bool kill_session); ++static void clear_in_failover(void); + + /* + fake_rotate_event() builds a fake (=which does not exist physically in any +@@ -255,7 +263,7 @@ + bool purge_master_logs(THD* thd, const char* to_log) + { + char search_file_name[FN_REFLEN]; +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + send_ok(thd); + return FALSE; +@@ -308,6 +316,44 @@ + return error; + } + ++/* Show processlist command dump the binlog state. ++ * ++ * Input: ++ * output_info - (OUT) the output proc_info ++ * output_len - (IN) output proc_info's length ++ * thd - (IN) the thread ++ * input_msg - (IN) the input proc_info ++ * log_file_name - (IN) binlog file name ++ * log_pos - (IN) binlog position ++ */ ++static void processlist_show_binlog_state(char *output_info, ++ int output_len, ++ THD *thd, ++ const char *input_msg, ++ const char *log_file_name, ++ my_off_t log_pos) { ++ DBUG_ENTER("processlist_show_binlog_state"); ++ ++ /* Point to input_msg in case "show processlist" access it before the copy ++ * is finished. 
++ */ ++ thd_proc_info(thd, input_msg); ++ ++ if (snprintf(output_info, output_len, "%s :%s:%lld:", input_msg, ++ log_file_name + dirname_length(log_file_name), ++ log_pos) > 0) { ++ thd_proc_info(thd, output_info); ++ } ++ ++ DBUG_VOID_RETURN; ++} ++ ++static void repl_cleanup(ushort flags) { ++ if (flags & BINLOG_MIRROR_CLIENT) { ++ /* One less mirror binlog client. */ ++ thread_safe_sub(rpl_mirror_binlog_clients, 1, &LOCK_stats); ++ } ++} + + /* + TODO: Clean up loop to only have one call to send_file() +@@ -319,6 +365,11 @@ + LOG_INFO linfo; + char *log_file_name = linfo.log_file_name; + char search_file_name[FN_REFLEN], *name; ++ ++ /* This buffer should be enough for "comments + :file_name:file_pos:". */ ++ char binlog_state_msg[FN_REFLEN + 100]; ++ int binlog_state_msg_len = FN_REFLEN + 100; ++ + IO_CACHE log; + File file = -1; + String* packet = &thd->packet; +@@ -335,6 +386,15 @@ + + bzero((char*) &log,sizeof(log)); + ++ sql_print_information("Start %s binlog_dump to slave_server(%d), pos(%s, %lu)", ++ "asynchronous", ++ thd->server_id, log_ident, (ulong)pos); ++ ++ if (flags & BINLOG_MIRROR_CLIENT) { ++ /* One more mirror binlog clients. */ ++ thread_safe_increment(rpl_mirror_binlog_clients, &LOCK_stats); ++ } ++ + #ifndef DBUG_OFF + if (opt_sporadic_binlog_dump_fail && (binlog_dump_count++ % 2)) + { +@@ -344,7 +404,7 @@ + } + #endif + +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + errmsg = "Binary log is not open"; + my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG; +@@ -529,6 +589,12 @@ + } + #endif + ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Send binlog events to slave", ++ log_file_name, pos); ++ + if ((*packet)[EVENT_TYPE_OFFSET+1] == FORMAT_DESCRIPTION_EVENT) + { + binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+1] & +@@ -634,6 +700,13 @@ + } + if (!thd->killed) + { ++ /* Update the binlog sending state. 
*/ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Has sent all binlog to slave; " ++ "waiting for binlog to be updated", ++ log_file_name, pos); ++ + /* Note that the following call unlocks lock_log */ + mysql_bin_log.wait_for_update(thd, 0); + } +@@ -650,7 +723,12 @@ + + if (read_packet) + { +- thd_proc_info(thd, "Sending binlog event to slave"); ++ // thd_proc_info(thd, "Sending binlog event to slave"); ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state(binlog_state_msg, ++ binlog_state_msg_len, thd, ++ "Sending binlog event to slave", ++ log_file_name, pos); + if (my_net_write(net, (char*)packet->ptr(), packet->length()) ) + { + errmsg = "Failed on my_net_write()"; +@@ -685,10 +763,21 @@ + } + else + { ++ char old_log_file_name[FN_REFLEN]; + bool loop_breaker = 0; + /* need this to break out of the for loop from switch */ + +- thd_proc_info(thd, "Finished reading one binlog; switching to next binlog"); ++ // thd_proc_info(thd, "Finished reading one binlog; switching to next binlog"); ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Finished reading one binlog; switching to next binlog", ++ log_file_name, pos); ++ ++ /* Keep the old fileename. */ ++ strmake(old_log_file_name, log_file_name, ++ sizeof(old_log_file_name) - 1); ++ + switch (mysql_bin_log.find_next_log(&linfo, 1)) { + case LOG_INFO_EOF: + loop_breaker = (flags & BINLOG_DUMP_NON_BLOCK); +@@ -706,6 +795,16 @@ + + end_io_cache(&log); + (void) my_close(file, MYF(MY_WME)); ++ ++ /* A sanity check that we can not serve the same binlog twice because ++ * the filenames are stored in a .index file. 
++ */ ++ if (strcmp(old_log_file_name, log_file_name) >= 0) { ++ errmsg = "Re-serving an already served binlog file."; ++ my_errno = ER_MASTER_FATAL_ERROR_READING_BINLOG; ++ goto err; ++ } ++ + + /* + Call fake_rotate_event() in case the previous log (the one which +@@ -733,6 +832,8 @@ + end_io_cache(&log); + (void)my_close(file, MYF(MY_WME)); + ++ repl_cleanup(flags); ++ + send_eof(thd); + thd_proc_info(thd, "Waiting to finalize termination"); + pthread_mutex_lock(&LOCK_thread_count); +@@ -743,6 +844,7 @@ + err: + thd_proc_info(thd, "Waiting to finalize termination"); + end_io_cache(&log); ++ repl_cleanup(flags); + /* + Exclude iteration through thread list + this is needed for purge_logs() - it will iterate through +@@ -1316,7 +1418,7 @@ + Format_description_log_event *description_event= new + Format_description_log_event(3); /* MySQL 4.0 by default */ + +- if (mysql_bin_log.is_open()) ++ if (mysql_bin_log.is_log_open()) + { + LEX_MASTER_INFO *lex_mi= &thd->lex->mi; + SELECT_LEX_UNIT *unit= &thd->lex->unit; +@@ -1456,7 +1558,7 @@ + DBUG_RETURN(TRUE); + protocol->prepare_for_resend(); + +- if (mysql_bin_log.is_open()) ++ if (mysql_bin_log.is_log_open()) + { + LOG_INFO li; + mysql_bin_log.get_current_log(&li); +@@ -1497,7 +1599,7 @@ + Protocol *protocol= thd->protocol; + DBUG_ENTER("show_binlogs"); + +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + my_message(ER_NO_BINARY_LOGGING, ER(ER_NO_BINARY_LOGGING), MYF(0)); + return 1; +@@ -1606,6 +1708,235 @@ + DBUG_RETURN(0); + } + ++ ++/* make_master: Make the current database a primary and starts the ++ * binlog logging for all updates. ++ * ++ * The function handles the following sql commands: ++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1, ++ * [WITH BINLOG]; ++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1, ++ * INDEX='replication_log.index' [WITH BINLOG]; ++ * . MAKE MASTER REVOKE SESSION; ++ * . MAKE MASTER REVOKE SESSION WITH KILL; ++ * . 
MAKE MASTER GRANT SESSION; ++ * ++ * Args: ++ * thd - the current thread ++ * binlog_name - binlog's filename ++ * binlog_indexname - binlog index's filename ++ * mi - master info struct containing binlog name ++ * (set when we enable master during runtime) ++ * ++ * Return: ++ * 0 : success ++ * -1 : failure ++ */ ++int make_master(THD* thd, ++ const char *binlog_name, ++ const char *binlog_indexname, ++ const LEX_MASTER_INFO* mi) { ++ int error = 0; ++ ++ DBUG_ENTER("make_master"); ++ /* In two mode, we enable the binlog: ++ * . !mi - LEX is not provided; this is called from startup time ++ * . mi->log_file_name - binlog is specified in the command ++ */ ++ if (!mi || mi->log_file_name) { ++ /* Get the mutex */ ++ VOID(pthread_mutex_lock(&LOCK_failover_master)); ++ ++ /* If the binlog is already opened, we issue an error. We reuse one ++ * existing error, which might not be fully accurate. ++ */ ++ if (mysql_bin_log.is_log_open()) { ++ my_error(ER_MASTER_INFO, MYF(0)); ++ sql_print_error("Replication master log is already open: cannot " ++ "make another master!"); ++ error = -1; ++ } else { ++ if (!mi) { ++ /* This opening happens at mysql startup time. */ ++ if (make_master_open_log(&mysql_bin_log, binlog_name, ++ 0, max_binlog_size) != 0) { ++ error = -1; ++ } ++ } else { ++ /* This opening happens during mysql runtime, which is mostly ++ * requested to do failover. ++ */ ++ ++ error = -1; ++ if (!is_in_failover()) { ++ sql_print_error( ++ "\"make master\" runs only in failover mode. " ++ "Please run \"make master revoke session (with kill)\""); ++ } else if (strlen(mi->log_file_name) == 0) { ++ sql_print_error("Master log filename is not specified correctly."); ++ } else if (!mi->server_id || mi->server_id == MASTER_INFO_SERVER_ID) { ++ sql_print_error("\"make master\": invalid server_id(%d)", ++ mi->server_id); ++ } else { ++ /* Open the new log files and delete all existing ones to avoid ++ * conflicts. 
++ */ ++ uint32 old_server_id = server_id; ++ char *binlog_name = NULL; ++ ++ /* Set the global master server id. ++ * We would not change server id for all connection threads. ++ * All non-super sessions should be blocked by revoke sessions. ++ * Super-user sessions are responsible for their own operations. ++ */ ++ server_id = mi->server_id; ++ thd->server_id = mi->server_id; ++ ++ if (!(binlog_name = my_strdup(mi->log_file_name, MYF(0))) || ++ make_master_open_index(&binlog_name, mi->log_index_name) != 0 || ++ make_master_open_log(&mysql_bin_log, binlog_name, ++ 0, max_binlog_size) != 0) { ++ sql_print_error("Open master logfile failed."); ++ thd->server_id = old_server_id; ++ server_id = old_server_id; ++ } else if (!mi->with_old_binlog && ++ mysql_bin_log.reset_logs(thd) != 0) { ++ sql_print_error("Cleanup existing master logfiles failed."); ++ thd->server_id = old_server_id; ++ server_id = old_server_id; ++ } else { ++ error = 0; ++ } ++ } ++ if (error == -1) ++ my_error(ER_MASTER_INFO, MYF(0)); ++ } ++ } ++ ++ if (error == 0) { ++ /* indicates that binlog is enabled now */ ++ using_update_log = 1; ++ } else if (mysql_bin_log.is_open()) { ++ mysql_bin_log.close(LOG_CLOSE_INDEX); ++ } ++ ++ /* Release the mutex */ ++ VOID(pthread_mutex_unlock(&LOCK_failover_master)); ++ } else { ++ /* The following actions are related to session management during ++ * failover operation. We do not want some sessions come in ++ * during failover and make updates. 
++ * This is invoked for command: MAKE MASTER GRANT/REVOKE SESSION; ++ */ ++ if (mi->in_failover) { ++ set_in_failover(mi->kill_session); ++ } else { ++ clear_in_failover(); ++ } ++ } ++ ++ DBUG_RETURN(error); ++} ++ ++static int make_master_open_log(MYSQL_LOG *log, ++ const char *opt_name, ++ bool no_auto_events, ++ ulong max_size) { ++ char tmp[FN_REFLEN]; ++ ++ // get rid of extension ++ char *p = fn_ext(opt_name); ++ uint length=(uint) (p-opt_name); ++ strmake(tmp,opt_name,min(length,FN_REFLEN)); ++ opt_name=tmp; ++ ++ return log->open(opt_name, LOG_BIN, NULL, WRITE_CACHE, 0, ++ max_size, 0); ++} ++ ++int make_master_open_index(char **binlog_name, ++ const char *binlog_indexname) { ++ char buf[FN_REFLEN]; ++ const char *ln; ++ DBUG_ENTER("make_master_open_index"); ++ ++ ln= mysql_bin_log.generate_name(*binlog_name, "-bin", 1, buf); ++ if (!(*binlog_name) && !binlog_indexname) { ++ /* ++ User didn't give us info to name the binlog index file. ++ Picking `hostname`-bin.index like did in 4.x, causes replication to ++ fail if the hostname is changed later. So, we would like to instead ++ require a name. But as we don't want to break many existing setups, we ++ only give warning, not error. ++ */ ++ sql_print_warning("No argument was provided to --log-bin, and " ++ "--log-bin-index was not used; so replication " ++ "may break when this MySQL server acts as a " ++ "master and has his hostname changed!! Please " ++ "use '--log-bin=%s' to avoid this problem.", ln); ++ } ++ if (ln == buf) { ++ my_free(*binlog_name, MYF(MY_ALLOW_ZERO_PTR)); ++ *binlog_name = my_strdup(buf, MYF(0)); ++ } ++ if (mysql_bin_log.open_index_file(binlog_indexname, ln) != 0) { ++ DBUG_RETURN(-1); ++ } ++ ++ /* ++ Used to specify which type of lock we need to use for queries of type ++ INSERT ... SELECT. This will change when we have row level logging. 
++ */ ++ using_update_log=1; ++ ++ DBUG_RETURN(0); ++} ++ ++/* Set the status indicating that we are in failover and deny all non-super ++ * user access. ++ * ++ * Args: ++ * kill_session - kill all non-super sessions if specified ++ * ++ * Return: ++ * 0 - success ++ * -1 - failure (caused by not killing all sessions) ++ */ ++static int set_in_failover(bool kill_session) { ++ failover_deny_access = 1; ++ ++ if (kill_session) { ++ /* If kill session option is specified, we need to kill all non-super ++ * user sessions. ++ */ ++ THD *kill_thd; ++ ++ uint error=ER_NO_SUCH_THREAD; ++ pthread_mutex_lock(&LOCK_thread_count); // For unlink from list ++ I_List_iterator<THD> it(threads); ++ while ((kill_thd=it++)) { ++ if (!(kill_thd->main_security_ctx.master_access & SUPER_ACL)) { ++ pthread_mutex_lock(&kill_thd->LOCK_delete); // Lock from delete ++ ++ /* ask the thread to die */ ++ kill_thd->awake(THD::KILL_CONNECTION); ++ pthread_mutex_unlock(&kill_thd->LOCK_delete); ++ } ++ } ++ pthread_mutex_unlock(&LOCK_thread_count); ++ } ++ return 0; ++} ++ ++static void clear_in_failover(void) { ++ failover_deny_access = 0; ++} ++ ++bool is_in_failover(void) { ++ return failover_deny_access; ++} ++ ++ + #endif /* HAVE_REPLICATION */ + + +diff -r 66cc9e0a6768 sql/sql_repl.h +--- a/sql/sql_repl.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_repl.h Thu Dec 04 21:46:15 2008 -0800 +@@ -38,6 +38,10 @@ + int start_slave(THD* thd, MASTER_INFO* mi, bool net_report); + int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report); + bool change_master(THD* thd, MASTER_INFO* mi); ++int make_master(THD* thd, const char *binlog_name, ++ const char *binlog_indexname, const LEX_MASTER_INFO* mi); ++int make_master_open_index(char **binlog_name, const char *binlog_indexname); ++bool is_in_failover(void); + bool mysql_show_binlog_events(THD* thd); + int cmp_master_pos(const char* log_file_name1, ulonglong log_pos1, + const char* log_file_name2, ulonglong log_pos2); +diff -r 66cc9e0a6768 
sql/sql_yacc.yy +--- a/sql/sql_yacc.yy Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_yacc.yy Thu Dec 04 21:46:15 2008 -0800 +@@ -735,6 +735,7 @@ + %token LOOP_SYM + %token LOW_PRIORITY + %token LT ++%token MAKE_SYM + %token MAKE_SET_SYM + %token MASTER_CONNECT_RETRY_SYM + %token MASTER_HOST_SYM +@@ -1167,7 +1168,7 @@ + query verb_clause create change select do drop insert replace insert2 + insert_values update delete truncate rename + show describe load alter optimize keycache preload flush +- reset purge begin commit rollback savepoint release ++ make reset purge begin commit rollback savepoint release + slave master_def master_defs master_file_def slave_until_opts + repair restore backup analyze check start checksum + field_list field_list_item field_spec kill column_def key_def +@@ -1301,6 +1302,7 @@ + | kill + | load + | lock ++ | make + | optimize + | keycache + | preload +@@ -1428,6 +1430,56 @@ + master_defs + {} + ; ++ ++/* make master */ ++make: ++ MAKE_SYM MASTER_SYM ++ { ++ LEX *lex = Lex; ++ lex->sql_command = SQLCOM_MAKE_MASTER; ++ bzero((char*) &lex->mi, sizeof(lex->mi)); ++ } ++ make_master_defs ++ { ++ } ++ ; ++ ++make_master_defs: ++ MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ++ { ++ Lex->mi.log_file_name = $3.str; ++ Lex->mi.server_id = $7; ++ } ++ make_master_with_defs {} ++ | MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ',' INDEX_SYM EQ TEXT_STRING ++ { ++ Lex->mi.log_file_name = $3.str; ++ Lex->mi.server_id = $7; ++ Lex->mi.log_index_name = $11.str; ++ } ++ make_master_with_defs {} ++ | GRANT SESSION_SYM ++ { ++ Lex->mi.in_failover = 0; ++ } ++ | REVOKE SESSION_SYM ++ { ++ Lex->mi.in_failover = 1; ++ } ++ | REVOKE SESSION_SYM WITH KILL_SYM ++ { ++ Lex->mi.in_failover = 1; ++ Lex->mi.kill_session = 1; ++ } ++ ; ++ ++make_master_with_defs: ++ /* empty */ {} ++ | WITH BINLOG_SYM ++ { ++ /* All old binlogs will be kept after "make master" command. 
*/ ++ Lex->mi.with_old_binlog = 1; ++ } + + master_defs: + master_def +@@ -8396,6 +8448,7 @@ + | HANDLER_SYM {} + | HELP_SYM {} + | LANGUAGE_SYM {} ++ | MAKE_SYM {} + | NO_SYM {} + | OPEN_SYM {} + | PREPARE_SYM {} diff --git a/percona/5.0.91-b22-20100522/mysql-test.patch b/percona/5.0.91-b22-20100522/mysql-test.patch new file mode 100644 index 0000000..00e0eb9 --- /dev/null +++ b/percona/5.0.91-b22-20100522/mysql-test.patch @@ -0,0 +1,140 @@ +--- a/mysql-test/r/information_schema.result 2009-05-07 19:31:26.000000000 +0000 ++++ b/mysql-test/r/information_schema.result 2009-05-07 19:32:59.000000000 +0000 +@@ -60,6 +60,7 @@ + USER_STATISTICS + VIEWS + INNODB_IO_PATTERN ++INNODB_RSEG + columns_priv + db + func +@@ -743,7 +744,7 @@ + CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1; + CREATE VIEW a2 AS SELECT t_CRASHME FROM a1; + count(*) +-109 ++110 + drop view a2, a1; + drop table t_crashme; + select table_schema,table_name, column_name from +@@ -819,7 +820,7 @@ + flush privileges; + SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA; + table_schema count(*) +-information_schema 24 ++information_schema 25 + mysql 17 + create table t1 (i int, j int); + create trigger trg1 before insert on t1 for each row +@@ -1228,6 +1229,7 @@ + USER_STATISTICS USER + VIEWS TABLE_SCHEMA + INNODB_IO_PATTERN SPACE ++INNODB_RSEG RSEG_ID + SELECT t.table_name, c1.column_name + FROM information_schema.tables t + INNER JOIN +@@ -1267,6 +1269,7 @@ + USER_STATISTICS USER + VIEWS TABLE_SCHEMA + INNODB_IO_PATTERN SPACE ++INNODB_RSEG RSEG_ID + SELECT MAX(table_name) FROM information_schema.tables; + MAX(table_name) + VIEWS +@@ -1342,6 +1345,7 @@ + INDEX_STATISTICS information_schema.INDEX_STATISTICS 1 + INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1 + INNODB_IO_PATTERN information_schema.INNODB_IO_PATTERN 1 ++INNODB_RSEG information_schema.INNODB_RSEG 1 + KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 + 
PROCESSLIST information_schema.PROCESSLIST 1 + PROFILING information_schema.PROFILING 1 +--- a/mysql-test/r/information_schema_db.result 2009-05-07 19:31:27.000000000 +0000 ++++ b/mysql-test/r/information_schema_db.result 2009-05-07 19:35:01.000000000 +0000 +@@ -29,6 +29,7 @@ + USER_STATISTICS + VIEWS + INNODB_IO_PATTERN ++INNODB_RSEG + show tables from INFORMATION_SCHEMA like 'T%'; + Tables_in_information_schema (T%) + TABLES +--- a/mysql-test/r/mysqlshow.result 2009-05-07 19:31:26.000000000 +0000 ++++ b/mysql-test/r/mysqlshow.result 2009-05-07 19:36:32.000000000 +0000 +@@ -103,6 +103,7 @@ + | USER_STATISTICS | + | VIEWS | + | INNODB_IO_PATTERN | ++| INNODB_RSEG | + +---------------------------------------+ + Database: INFORMATION_SCHEMA + +---------------------------------------+ +@@ -132,6 +133,7 @@ + | USER_STATISTICS | + | VIEWS | + | INNODB_IO_PATTERN | ++| INNODB_RSEG | + +---------------------------------------+ + Wildcard: inf_rmation_schema + +--------------------+ +--- a/mysql-test/r/profiling.result 2009-05-28 19:39:42.000000000 +0000 ++++ b/mysql-test/r/profiling.result 2009-05-28 19:40:14.000000000 +0000 +@@ -6,6 +6,8 @@ + Variable_name Value + profiling OFF + profiling_history_size 15 ++profiling_server OFF ++profiling_use_getrusage OFF + select @@profiling; + @@profiling + 0 +@@ -16,12 +18,16 @@ + Variable_name Value + profiling OFF + profiling_history_size 100 ++profiling_server OFF ++profiling_use_getrusage OFF + set session profiling = ON; + set session profiling_history_size=30; + show session variables like 'profil%'; + Variable_name Value + profiling ON + profiling_history_size 30 ++profiling_server OFF ++profiling_use_getrusage OFF + select @@profiling; + @@profiling + 1 +--- a/mysql-test/r/mysql.result 2010-02-19 23:59:36.000000000 -0500 ++++ b/mysql-test/r/mysql.result 2010-02-19 23:58:50.000000000 -0500 +@@ -162,8 +162,8 @@ + ERROR 1049 (42000) at line 1: Unknown database 'invalid' + Test connect with dbname + hostname + Test connect with 
dbname + _invalid_ hostname +-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno) +-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno) ++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno) ++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno) + The commands reported in the bug report + ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'cyril has found a bug :)XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' (errno) + Too long dbname +@@ -198,6 +198,6 @@ + 1 + COUNT (*) + 1 +-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno) ++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno) + <TABLE BORDER=1><TR><TH><</TH></TR><TR><TD>< & ></TD></TR></TABLE> + End of 5.0 tests +--- a/mysql-test/r/mysql_upgrade.result 2010-02-19 23:58:16.000000000 -0500 ++++ b/mysql-test/r/mysql_upgrade.result 2010-02-20 00:01:34.000000000 -0500 +@@ -58,7 +58,7 @@ + mysql.user OK + DROP USER mysqltest1@'%'; + Run mysql_upgrade with a non existing server socket +-mysqlcheck: Got error: 2005: Unknown MySQL server host 'not_existing_host' (errno) when trying to connect ++mysqlcheck: Got error: 2003: Can't connect to MySQL server on 'not_existing_host' (errno) when trying to connect + FATAL ERROR: Upgrade failed + set GLOBAL sql_mode='STRICT_ALL_TABLES,ANSI_QUOTES,NO_ZERO_DATE'; + mysql.columns_priv OK diff --git a/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch b/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch new file mode 100644 index 0000000..a493a29 --- /dev/null +++ b/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch @@ -0,0 +1,127 @@ +diff -r d91edeb58b50 patch_info/mysqld_safe_syslog.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/mysqld_safe_syslog.info Mon Sep 01 21:58:00 2008 -0700 +@@ -0,0 +1,6 @@ 
++File=mysqld_safe_syslog.patch ++Name=Patch allows redirect output of error.log to syslog-ng ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment=Ported from Debian +diff -r d91edeb58b50 scripts/mysqld_safe.sh +--- a/scripts/mysqld_safe.sh Mon Sep 01 21:57:21 2008 -0700 ++++ b/scripts/mysqld_safe.sh Mon Sep 01 21:58:00 2008 -0700 +@@ -10,12 +10,16 @@ + # mysql.server works by first doing a cd to the base directory and from there + # executing mysqld_safe + +-KILL_MYSQLD=1; + MYSQLD= + + trap '' 1 2 3 15 # we shouldn't let anyone kill us + + umask 007 ++ ++KILL_MYSQLD=1; ++ ++# This command can be used as pipe to syslog. With "-s" it also logs to stderr. ++ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i" + + defaults= + case "$1" in +@@ -177,7 +181,6 @@ + + # these rely on $DATADIR by default, so we'll set them later on + pid_file= +-err_log= + + # Get first arguments from the my.cnf file, groups [mysqld] and [mysqld_safe] + # and then merge with the command line arguments +@@ -245,7 +248,6 @@ + * ) pid_file="$DATADIR/$pid_file" ;; + esac + fi +-test -z "$err_log" && err_log=$DATADIR/`@HOSTNAME@`.err + + if test -n "$mysql_unix_port" + then +@@ -315,8 +317,6 @@ + then + USER_OPTION="--user=$user" + fi +- # If we are root, change the err log to the right user. 
+- touch $err_log; chown $user $err_log + if test -n "$open_files" + then + ulimit -n $open_files +@@ -341,18 +341,16 @@ + then + if @FIND_PROC@ + then # The pid contains a mysqld process +- echo "A mysqld process already exists" +- echo "A mysqld process already exists at " `date` >> $err_log ++ echo "A mysqld process already exists" | $ERR_LOGGER -s + exit 1 + fi + fi + rm -f $pid_file + if test -f $pid_file + then +- echo "Fatal error: Can't remove the pid file: $pid_file" +- echo "Fatal error: Can't remove the pid file: $pid_file at " `date` >> $err_log +- echo "Please remove it manually and start $0 again" +- echo "mysqld daemon not started" ++ echo "Fatal error: Can't remove the pid file: $pid_file" | $ERR_LOGGER -s ++ echo "Please remove it manually and start $0 again" | $ERR_LOGGER -s ++ echo "mysqld daemon not started" | $ERR_LOGGER -s + exit 1 + fi + fi +@@ -377,15 +375,15 @@ + # ulimit -n 256 > /dev/null 2>&1 # Fix for BSD and FreeBSD systems + #fi + +-echo "`date +'%y%m%d %H:%M:%S mysqld started'`" >> $err_log ++echo "started" | $ERR_LOGGER -s + while true + do + rm -f $safe_mysql_unix_port $pid_file # Some extra safety + if test -z "$args" + then +- $NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ >> $err_log 2>&1 ++ $NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ 2>&1 | $ERR_LOGGER -t mysqld + else +- eval "$NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ $args >> $err_log 2>&1" ++ eval "$NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ $args 2>&1 | $ERR_LOGGER -t mysqld" + fi + if test ! 
-f $pid_file # This is removed if normal shutdown + then +@@ -402,7 +400,7 @@ + # kill -9 is used or the process won't react on the kill. + numofproces=`ps xaww | grep -v "grep" | grep "$ledir/$MYSQLD\>" | grep -c "pid-file=$pid_file"` + +- echo -e "\nNumber of processes running now: $numofproces" | tee -a $err_log ++ echo -e "\nNumber of processes running now: $numofproces" | $ERR_LOGGER -s + I=1 + while test "$I" -le "$numofproces" + do +@@ -415,16 +413,14 @@ + # echo "TEST $I - $T **" + if kill -9 $T + then +- echo "$MYSQLD process hanging, pid $T - killed" | tee -a $err_log ++ echo "$MYSQLD process hanging, pid $T - killed" | $ERR_LOGGER -s + else + break + fi + I=`expr $I + 1` + done + fi +- echo "`date +'%y%m%d %H:%M:%S'` mysqld restarted" | tee -a $err_log ++ echo "restarted" | $ERR_LOGGER -s + done + +-echo "`date +'%y%m%d %H:%M:%S'` mysqld ended" | tee -a $err_log +-echo "" | tee -a $err_log +- ++echo "ended" | $ERR_LOGGER -s diff --git a/percona/5.0.91-b22-20100522/profiling_slow.patch b/percona/5.0.91-b22-20100522/profiling_slow.patch new file mode 100644 index 0000000..78d35a0 --- /dev/null +++ b/percona/5.0.91-b22-20100522/profiling_slow.patch @@ -0,0 +1,271 @@ +diff -r 4636d2e0b0d0 patch_info/profiling_slow.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/profiling_slow.info Fri Jul 03 15:40:29 2009 -0700 +@@ -0,0 +1,9 @@ ++File=profiling_slow.info ++Name=profiling from SHOW PROFILE to slow.log ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= ++Changelog ++2009-05-18 ++Initial implementation +diff -r 4636d2e0b0d0 sql/log.cc +--- a/sql/log.cc Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/log.cc Fri Jul 03 15:40:29 2009 -0700 +@@ -2402,6 +2402,11 @@ + tmp_errno=errno; + } + } ++ ++#if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER) ++ thd->profiling.print_current(&log_file); ++#endif ++ + if (thd->db && strcmp(thd->db,db)) + { // Database changed + if (my_b_printf(&log_file,"use %s;\n",thd->db) == (uint) 
-1) +diff -r 4636d2e0b0d0 sql/mysqld.cc +--- a/sql/mysqld.cc Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/mysqld.cc Fri Jul 03 15:40:29 2009 -0700 +@@ -5052,6 +5052,8 @@ + OPT_PORT_OPEN_TIMEOUT, + OPT_MERGE, + OPT_PROFILING, ++ OPT_PROFILING_SERVER, ++ OPT_PROFILING_USE_GETRUSAGE, + OPT_SLOW_LOG, + OPT_SLOW_QUERY_LOG_FILE, + OPT_USE_GLOBAL_LONG_QUERY_TIME, +@@ -5675,6 +5677,16 @@ + (gptr*) &global_system_variables.profiling_history_size, + (gptr*) &max_system_variables.profiling_history_size, + 0, GET_ULONG, REQUIRED_ARG, 15, 0, 100, 0, 0, 0}, ++ {"profiling_server", OPT_PROFILING_SERVER, ++ "Enable profiling of all threads", ++ (gptr*) &global_system_variables.profiling_server, ++ (gptr*) &max_system_variables.profiling_server, 0, GET_BOOL, ++ OPT_ARG, 0, 0, 0, 0, 0, 0 }, ++ {"profiling_use_getrusage", OPT_PROFILING_USE_GETRUSAGE, ++ "Enable getrusage function call for profiling", ++ (gptr*) &global_system_variables.profiling_use_getrusage, ++ (gptr*) &max_system_variables.profiling_use_getrusage, 0, GET_BOOL, ++ OPT_ARG, 0, 0, 0, 0, 0, 0 }, + #endif + {"relay-log", OPT_RELAY_LOG, + "The location and name to use for relay logs.", +diff -r 4636d2e0b0d0 sql/set_var.cc +--- a/sql/set_var.cc Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/set_var.cc Fri Jul 03 15:40:29 2009 -0700 +@@ -592,6 +592,10 @@ + ulonglong(OPTION_PROFILING)); + static sys_var_thd_ulong sys_profiling_history_size("profiling_history_size", + &SV::profiling_history_size); ++static sys_var_thd_bool sys_profiling_server("profiling_server", ++ &SV::profiling_server); ++static sys_var_thd_bool sys_profiling_use_getrusage("profiling_use_getrusage", ++ &SV::profiling_use_getrusage); + #endif + + /* Local state variables */ +@@ -764,6 +768,8 @@ + #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER) + &sys_profiling, + &sys_profiling_history_size, ++ &sys_profiling_server, ++ &sys_profiling_use_getrusage, + #endif + &sys_pseudo_thread_id, + &sys_query_alloc_block_size, +@@ -1094,6 +1100,8 @@ + #if 
defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER) + {sys_profiling.name, (char*) &sys_profiling, SHOW_SYS}, + {sys_profiling_history_size.name, (char*) &sys_profiling_history_size, SHOW_SYS}, ++ {sys_profiling_server.name, (char*) &sys_profiling_server, SHOW_SYS}, ++ {sys_profiling_use_getrusage.name, (char*) &sys_profiling_use_getrusage, SHOW_SYS}, + #endif + {"protocol_version", (char*) &protocol_version, SHOW_INT}, + {sys_query_alloc_block_size.name, (char*) &sys_query_alloc_block_size, +diff -r 4636d2e0b0d0 sql/sql_class.h +--- a/sql/sql_class.h Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/sql_class.h Fri Jul 03 15:40:29 2009 -0700 +@@ -550,6 +550,8 @@ + ulong optimizer_search_depth; + ulong preload_buff_size; + ulong profiling_history_size; ++ my_bool profiling_server; ++ my_bool profiling_use_getrusage; + ulong query_cache_type; + ulong log_slow_rate_limit; + ulong read_buff_size; +diff -r 4636d2e0b0d0 sql/sql_profile.cc +--- a/sql/sql_profile.cc Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/sql_profile.cc Fri Jul 03 15:40:29 2009 -0700 +@@ -221,9 +221,22 @@ + */ + void PROF_MEASUREMENT::collect() + { ++ struct timespec tp; + time_usecs= (double) my_getsystime() / 10.0; /* 1 sec was 1e7, now is 1e6 */ + #ifdef HAVE_GETRUSAGE +- getrusage(RUSAGE_SELF, &rusage); ++ if ((profile->get_profiling())->enabled_getrusage()) ++ getrusage(RUSAGE_SELF, &rusage); ++#endif ++ ++#ifdef HAVE_CLOCK_GETTIME ++ if (!(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ { ++ cpu_time_usecs= tp.tv_sec*1000000000.0 + tp.tv_nsec; ++ } ++ else ++ { ++ cpu_time_usecs= 0; ++ } + #endif + } + +@@ -341,7 +354,7 @@ + finish_current_query(); + } + +- enabled= (((thd)->options & OPTION_PROFILING) != 0); ++ enabled= (((thd)->options & OPTION_PROFILING) != 0) || ( thd->variables.profiling_server ); + + if (! enabled) DBUG_VOID_RETURN; + +@@ -379,7 +392,8 @@ + status_change("ending", NULL, NULL, 0); + + if ((enabled) && /* ON at start? 
*/ +- ((thd->options & OPTION_PROFILING) != 0) && /* and ON at end? */ ++ (((thd->options & OPTION_PROFILING) != 0) || ++ (thd->variables.profiling_server)) && /* and ON at end? */ + (current->query_source != NULL) && + (! current->entries.is_empty())) + { +@@ -480,6 +494,88 @@ + DBUG_VOID_RETURN; + } + ++bool PROFILING::enabled_getrusage() ++{ ++ return thd->variables.profiling_use_getrusage; ++} ++ ++/** ++ Print output for current query to file ++*/ ++ ++int PROFILING::print_current(IO_CACHE *log_file) ++{ ++ DBUG_ENTER("PROFILING::print_current"); ++ ulonglong row_number= 0; ++ char query_time_buff[22+7]; ++ char query_cpu_time_buff[22+7]; ++ ++ QUERY_PROFILE *query; ++ /* Get current query */ ++ if (current == NULL) ++ { ++ DBUG_RETURN(0); ++ } ++ ++ query= current; ++ ++ my_b_printf(log_file, "# PROFILE_VALUES "); ++ ++ void *entry_iterator; ++ PROF_MEASUREMENT *entry, *previous= NULL, *first= NULL; ++ /* ...and for each query, go through all its state-change steps. */ ++ for (entry_iterator= query->entries.new_iterator(); ++ entry_iterator != NULL; ++ entry_iterator= query->entries.iterator_next(entry_iterator), ++ previous=entry, row_number++) ++ { ++ entry= query->entries.iterator_value(entry_iterator); ++ ++ /* Skip the first. We count spans of fence, not fence-posts. */ ++ if (previous == NULL) {first= entry; continue;} ++ ++ if (thd->lex->orig_sql_command == SQLCOM_SHOW_PROFILE) ++ { ++ /* ++ We got here via a SHOW command. That means that we stored ++ information about the query we wish to show and that isn't ++ in a WHERE clause at a higher level to filter out rows we ++ wish to exclude. ++ ++ Because that functionality isn't available in the server yet, ++ we must filter here, at the wrong level. Once one can con- ++ struct where and having conditions at the SQL layer, then this ++ condition should be ripped out. 
++ */ ++ if (thd->lex->profile_query_id == 0) /* 0 == show final query */ ++ { ++ if (query != last) ++ continue; ++ } ++ else ++ { ++ if (thd->lex->profile_query_id != query->profiling_query_id) ++ continue; ++ } ++ } ++ ++ snprintf(query_time_buff, sizeof(query_time_buff), "%.6f", (entry->time_usecs-previous->time_usecs)/(1000.0*1000)); ++ snprintf(query_cpu_time_buff, sizeof(query_cpu_time_buff), "%.6f", (entry->cpu_time_usecs-previous->cpu_time_usecs)/(1000.0*1000*1000)); ++ my_b_printf(log_file, "%s: %s (cpu: %s), ", previous->status, query_time_buff, query_cpu_time_buff); ++ ++ } ++ ++ my_b_printf(log_file, "\n"); ++ if ((entry != NULL) && (first != NULL)) ++ { ++ snprintf(query_time_buff, sizeof(query_time_buff), "%.6f", (entry->time_usecs-first->time_usecs)/(1000.0*1000)); ++ snprintf(query_cpu_time_buff, sizeof(query_cpu_time_buff), "%.6f", (entry->cpu_time_usecs-first->cpu_time_usecs)/(1000.0*1000*1000)); ++ my_b_printf(log_file, "# PROFILE_TOTALS Total: %s (cpu: %s)\n", query_time_buff, query_cpu_time_buff); ++ } ++ ++ DBUG_RETURN(0); ++} ++ + /** + Fill the information schema table, "query_profile", as defined in show.cc . + There are two ways to get to this function: Selecting from the information +diff -r 4636d2e0b0d0 sql/sql_profile.h +--- a/sql/sql_profile.h Fri Jul 03 15:40:20 2009 -0700 ++++ b/sql/sql_profile.h Fri Jul 03 15:40:29 2009 -0700 +@@ -193,6 +193,7 @@ + unsigned int line; + + double time_usecs; ++ double cpu_time_usecs; + char *allocated_status_memory; + + void set_label(const char *status_arg, const char *function_arg, +@@ -243,6 +244,11 @@ + + /* Show this profile. This is called by PROFILING. */ + bool show(uint options); ++ ++public: ++ ++ inline PROFILING * get_profiling() { return profiling; }; ++ + }; + + +@@ -288,9 +294,11 @@ + + /* SHOW PROFILES */ + bool show_profiles(); ++ bool enabled_getrusage(); + + /* ... from INFORMATION_SCHEMA.PROFILING ... 
*/ + int fill_statistics_info(THD *thd, TABLE_LIST *tables, Item *cond); ++ int print_current(IO_CACHE *log_file); + }; + + # endif /* HAVE_PROFILING */ diff --git a/percona/5.0.91-b22-20100522/series b/percona/5.0.91-b22-20100522/series new file mode 100644 index 0000000..0dcc631 --- /dev/null +++ b/percona/5.0.91-b22-20100522/series @@ -0,0 +1,22 @@ +show_patches.patch +microslow_innodb.patch +profiling_slow.patch +userstatv2.patch +microsec_process.patch +innodb_io_patches.patch +mysqld_safe_syslog.patch +innodb_locks_held.patch +innodb_show_bp.patch +innodb_check_fragmentation.patch +innodb_io_pattern.patch +innodb_fsync_source.patch +innodb_show_hashed_memory.patch +innodb_dict_size_limit.patch +innodb_extra_rseg.patch +innodb_thread_concurrency_timer_based.patch +innodb_use_sys_malloc.patch +innodb_recovery_patches.patch +innodb_misc_patch.patch +innodb_split_buf_pool_mutex.patch +innodb_rw_lock.patch +mysql-test.patch diff --git a/percona/5.0.91-b22-20100522/show_patches.patch b/percona/5.0.91-b22-20100522/show_patches.patch new file mode 100644 index 0000000..7f1d431 --- /dev/null +++ b/percona/5.0.91-b22-20100522/show_patches.patch @@ -0,0 +1,288 @@ +diff -r c3e57b0c22c4 patch_info/show_patches.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/show_patches.info Mon Dec 22 00:25:06 2008 -0800 +@@ -0,0 +1,6 @@ ++File=show_patches.patch ++Name=SHOW PATCHES ++Version=1.0 ++Author=Jeremy Cole ++License=N/A ++Comment +diff -r c3e57b0c22c4 sql/Makefile.am +--- a/sql/Makefile.am Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/Makefile.am Mon Dec 22 00:25:06 2008 -0800 +@@ -118,7 +118,7 @@ + -DSHAREDIR="\"$(MYSQLSHAREdir)\"" \ + @DEFS@ + +-BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h ++BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h patch_info.h + EXTRA_DIST = $(BUILT_SOURCES) nt_servc.cc nt_servc.h \ + message.mc message.h message.rc MSG00001.bin \ + examples/CMakeLists.txt CMakeLists.txt \ +@@ -175,6 +175,8 @@ + udf_example_la_SOURCES= 
udf_example.c + udf_example_la_LDFLAGS= -module -rpath $(pkglibdir) + ++patch_info.h: patch_info.h.pl ++ $(PERL) $< > $@ + + # Don't update the files from bitkeeper + %::SCCS/s.% +diff -r c3e57b0c22c4 sql/Makefile.in +--- a/sql/Makefile.in Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/Makefile.in Mon Dec 22 00:25:06 2008 -0800 +@@ -561,7 +561,7 @@ + gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) + mysql_tzinfo_to_sql_SOURCES = mysql_tzinfo_to_sql.cc + mysql_tzinfo_to_sql_LDADD = @MYSQLD_EXTRA_LDFLAGS@ $(LDADD) $(CXXLDFLAGS) +-BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h ++BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h patch_info.h + EXTRA_DIST = $(BUILT_SOURCES) nt_servc.cc nt_servc.h \ + message.mc message.h message.rc MSG00001.bin \ + examples/CMakeLists.txt CMakeLists.txt \ +@@ -1237,6 +1237,9 @@ + ./gen_lex_hash$(EXEEXT) > $@-t + $(MV) $@-t $@ + ++patch_info.h: patch_info.h.pl ++ $(PERL) $< > $@ ++ + # Don't update the files from bitkeeper + %::SCCS/s.% + # Tell versions [3.59,3.63) of GNU make to not export all variables. 
+diff -r c3e57b0c22c4 sql/lex.h +--- a/sql/lex.h Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/lex.h Mon Dec 22 00:25:06 2008 -0800 +@@ -367,6 +367,7 @@ + { "PACK_KEYS", SYM(PACK_KEYS_SYM)}, + { "PARTIAL", SYM(PARTIAL)}, + { "PASSWORD", SYM(PASSWORD)}, ++ { "PATCHES", SYM(PATCHES)}, + { "PHASE", SYM(PHASE_SYM)}, + { "POINT", SYM(POINT_SYM)}, + { "POLYGON", SYM(POLYGON)}, +diff -r c3e57b0c22c4 sql/mysql_priv.h +--- a/sql/mysql_priv.h Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/mysql_priv.h Mon Dec 22 00:25:06 2008 -0800 +@@ -968,6 +968,7 @@ + int mysqld_show_status(THD *thd); + int mysqld_show_variables(THD *thd,const char *wild); + bool mysqld_show_storage_engines(THD *thd); ++bool mysqld_show_patches(THD *thd); + bool mysqld_show_privileges(THD *thd); + bool mysqld_show_column_types(THD *thd); + bool mysqld_help (THD *thd, const char *text); +diff -r c3e57b0c22c4 sql/patch_info.h.pl +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/sql/patch_info.h.pl Mon Dec 22 00:25:06 2008 -0800 +@@ -0,0 +1,65 @@ ++use strict; ++ ++my $patch_info_path = '../patch_info'; ++my $file = ''; ++my $output = ''; ++ ++ ++if (opendir(PATCH_DIR, $patch_info_path)) ++{ ++ while ((my $file = readdir(PATCH_DIR))) ++ { ++ open(PATCH_FILE, "<$patch_info_path/$file") || die("Unable to open $patch_info_path/$file ($!)"); ++ my %fields; ++ ++ if ($file =~ /^\./) ++ { ++ next; ++ } ++ ++ while (<PATCH_FILE>) ++ { ++ chomp; ++ ++ my ($key, $value) = split(/\s*=\s*/); ++ $fields{lc($key)} = $value; ++ } ++ ++ $output .= "{\"$fields{'file'}\", \"$fields{'name'}\", \"$fields{'version'}\", \"$fields{'author'}\", \"$fields{'license'}\",\"$fields{'comment'}\"},\n" ++ } ++} ++ ++print <<HEADER; ++ ++/* Copyright (C) 2002-2006 MySQL AB ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; version 2 of the License. 
++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ ++ ++#ifdef USE_PRAGMA_INTERFACE ++#pragma interface /* gcc class implementation */ ++#endif ++ ++struct patch { ++ const char *file; ++ const char *name; ++ const char *version; ++ const char *author; ++ const char *license; ++ const char *comment; ++}patches[] = { ++$output ++{NULL, NULL, NULL, NULL} ++}; ++ ++HEADER +diff -r c3e57b0c22c4 sql/sp_head.cc +--- a/sql/sp_head.cc Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sp_head.cc Mon Dec 22 00:25:06 2008 -0800 +@@ -191,6 +191,7 @@ + case SQLCOM_SHOW_MUTEX_STATUS: + case SQLCOM_SHOW_NEW_MASTER: + case SQLCOM_SHOW_OPEN_TABLES: ++ case SQLCOM_SHOW_PATCHES: + case SQLCOM_SHOW_PRIVILEGES: + case SQLCOM_SHOW_PROCESSLIST: + case SQLCOM_SHOW_SLAVE_HOSTS: +diff -r c3e57b0c22c4 sql/sql_lex.h +--- a/sql/sql_lex.h Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sql_lex.h Mon Dec 22 00:25:06 2008 -0800 +@@ -95,6 +95,7 @@ + SQLCOM_XA_COMMIT, SQLCOM_XA_ROLLBACK, SQLCOM_XA_RECOVER, + SQLCOM_SHOW_PROC_CODE, SQLCOM_SHOW_FUNC_CODE, + SQLCOM_SHOW_PROFILE, SQLCOM_SHOW_PROFILES, ++ SQLCOM_SHOW_PATCHES, + + /* + When a command is added here, be sure it's also added in mysqld.cc +diff -r c3e57b0c22c4 sql/sql_parse.cc +--- a/sql/sql_parse.cc Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sql_parse.cc Mon Dec 22 00:25:06 2008 -0800 +@@ -3947,6 +3947,9 @@ + break; + case SQLCOM_SHOW_STORAGE_ENGINES: + res= mysqld_show_storage_engines(thd); ++ break; ++ case SQLCOM_SHOW_PATCHES: ++ res= mysqld_show_patches(thd); + break; + case SQLCOM_SHOW_PRIVILEGES: + res= mysqld_show_privileges(thd); +diff -r 
c3e57b0c22c4 sql/sql_prepare.cc +--- a/sql/sql_prepare.cc Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sql_prepare.cc Mon Dec 22 00:25:06 2008 -0800 +@@ -1790,6 +1790,7 @@ + case SQLCOM_SHOW_DATABASES: + case SQLCOM_SHOW_PROCESSLIST: + case SQLCOM_SHOW_STORAGE_ENGINES: ++ case SQLCOM_SHOW_PATCHES: + case SQLCOM_SHOW_PRIVILEGES: + case SQLCOM_SHOW_COLUMN_TYPES: + case SQLCOM_SHOW_STATUS: +diff -r c3e57b0c22c4 sql/sql_show.cc +--- a/sql/sql_show.cc Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sql_show.cc Mon Dec 22 00:25:06 2008 -0800 +@@ -22,6 +22,7 @@ + #include "sp.h" + #include "sp_head.h" + #include "sql_trigger.h" ++#include "patch_info.h" + #include <my_dir.h> + + #ifdef HAVE_BERKELEY_DB +@@ -45,6 +46,47 @@ + static int + view_store_create_info(THD *thd, TABLE_LIST *table, String *buff); + bool schema_table_store_record(THD *thd, TABLE *table); ++ ++/*************************************************************************** ++** List patches built into this release ++***************************************************************************/ ++ ++bool mysqld_show_patches(THD *thd) ++{ ++ List<Item> field_list; ++ int i = 0; ++ Protocol *protocol= thd->protocol; ++ DBUG_ENTER("mysqld_show_patches"); ++ ++ field_list.push_back(new Item_empty_string("File", 255)); ++ field_list.push_back(new Item_empty_string("Name", 50)); ++ field_list.push_back(new Item_empty_string("Version", 10)); ++ field_list.push_back(new Item_empty_string("Author", 50)); ++ field_list.push_back(new Item_empty_string("License", 50)); ++ field_list.push_back(new Item_empty_string("Comment", 32)); ++ ++ if (protocol->send_fields(&field_list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF)) ++ DBUG_RETURN(TRUE); ++ ++ for (i = 0; patches[i].file; i++) ++ { ++ protocol->prepare_for_resend(); ++ protocol->store(patches[i].file, system_charset_info); ++ protocol->store(patches[i].name, system_charset_info); ++ protocol->store(patches[i].version, system_charset_info); ++ 
protocol->store(patches[i].author, system_charset_info); ++ protocol->store(patches[i].license, system_charset_info); ++ protocol->store(patches[i].comment, system_charset_info); ++ ++ if (protocol->write()) ++ DBUG_RETURN(TRUE); ++ } ++ ++ ++ send_eof(thd); ++ DBUG_RETURN(FALSE); ++ ++} + + + /*************************************************************************** +diff -r c3e57b0c22c4 sql/sql_yacc.yy +--- a/sql/sql_yacc.yy Mon Dec 22 00:20:06 2008 -0800 ++++ b/sql/sql_yacc.yy Mon Dec 22 00:25:06 2008 -0800 +@@ -824,6 +824,7 @@ + %token PAGE_SYM + %token PARTIAL + %token PASSWORD ++%token PATCHES + %token PARAM_MARKER + %token PHASE_SYM + %token POINTFROMTEXT +@@ -8019,7 +8020,7 @@ + ; + + show_param: +- DATABASES wild_and_where ++ DATABASES wild_and_where + { + LEX *lex= Lex; + lex->sql_command= SQLCOM_SELECT; +@@ -8119,6 +8120,10 @@ + LEX *lex=Lex; + lex->sql_command= SQLCOM_SHOW_STORAGE_ENGINES; + WARN_DEPRECATED("SHOW TABLE TYPES", "SHOW [STORAGE] ENGINES"); ++ } ++ | PATCHES ++ { ++ Lex->sql_command= SQLCOM_SHOW_PATCHES; + } + | opt_storage ENGINES_SYM + { +@@ -9554,6 +9559,7 @@ + | PAGE_SYM {} + | PARTIAL {} + | PASSWORD {} ++ | PATCHES {} + | PHASE_SYM {} + | POINT_SYM {} + | POLYGON {} diff --git a/percona/5.0.91-b22-20100522/userstatv2.patch b/percona/5.0.91-b22-20100522/userstatv2.patch new file mode 100644 index 0000000..427fef7 --- /dev/null +++ b/percona/5.0.91-b22-20100522/userstatv2.patch @@ -0,0 +1,4406 @@ +diff -r 592f6c3641ba BUILD/Makefile.in +--- a/BUILD/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/BUILD/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -146,6 +146,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba Docs/Makefile.in +--- a/Docs/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/Docs/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -144,6 +144,7 @@ + LIBDL = @LIBDL@ + 
LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba Makefile.in +--- a/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -171,6 +171,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba SSL/Makefile.in +--- a/SSL/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/SSL/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -144,6 +144,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba client/Makefile.in +--- a/client/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/client/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -247,6 +247,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @CLIENT_LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba cmd-line-utils/Makefile.in +--- a/cmd-line-utils/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/cmd-line-utils/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -157,6 +157,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba cmd-line-utils/libedit/Makefile.in +--- a/cmd-line-utils/libedit/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/cmd-line-utils/libedit/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -166,6 +166,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba 
cmd-line-utils/readline/Makefile.in +--- a/cmd-line-utils/readline/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/cmd-line-utils/readline/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -173,6 +173,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba configure +--- a/configure Wed Jul 29 13:33:34 2009 -0700 ++++ b/configure Wed Jul 29 13:34:11 2009 -0700 +@@ -35347,7 +35347,91 @@ + # We also disable for SCO for the time being, the headers for the + # thread library we use conflicts with other headers. + ;; +- *) ++*) ++ # most systems require the program be linked with librt library to use ++ # the function clock_gettime ++ my_save_LIBS="$LIBS" ++ LIBS="" ++ ++echo "$as_me:$LINENO: checking for clock_gettime in -lrt" >&5 ++echo $ECHO_N "checking for clock_gettime in -lrt... $ECHO_C" >&6 ++if test "${ac_cv_lib_rt_clock_gettime+set}" = set; then ++ echo $ECHO_N "(cached) $ECHO_C" >&6 ++else ++ ac_check_lib_save_LIBS=$LIBS ++LIBS="-lrt $LIBS" ++cat >conftest.$ac_ext <<_ACEOF ++/* confdefs.h. */ ++_ACEOF ++cat confdefs.h >>conftest.$ac_ext ++cat >>conftest.$ac_ext <<_ACEOF ++/* end confdefs.h. */ ++ ++/* Override any gcc2 internal prototype to avoid an error. */ ++#ifdef __cplusplus ++extern "C" ++#endif ++/* We use char because int might match the return type of a gcc2 ++ builtin and then its argument prototype would still apply. */ ++char clock_gettime (); ++int ++main () ++{ ++clock_gettime (); ++ ; ++ return 0; ++} ++_ACEOF ++rm -f conftest.$ac_objext conftest$ac_exeext ++if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 ++ (eval $ac_link) 2>conftest.er1 ++ ac_status=$? ++ grep -v '^ *+' conftest.er1 >conftest.err ++ rm -f conftest.er1 ++ cat conftest.err >&5 ++ echo "$as_me:$LINENO: \$? = $ac_status" >&5 ++ (exit $ac_status); } && ++ { ac_try='test -z "$ac_c_werror_flag" ++ || test ! 
-s conftest.err' ++ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ echo "$as_me:$LINENO: \$? = $ac_status" >&5 ++ (exit $ac_status); }; } && ++ { ac_try='test -s conftest$ac_exeext' ++ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ echo "$as_me:$LINENO: \$? = $ac_status" >&5 ++ (exit $ac_status); }; }; then ++ ac_cv_lib_rt_clock_gettime=yes ++else ++ echo "$as_me: failed program was:" >&5 ++sed 's/^/| /' conftest.$ac_ext >&5 ++ ++ac_cv_lib_rt_clock_gettime=no ++fi ++rm -f conftest.err conftest.$ac_objext \ ++ conftest$ac_exeext conftest.$ac_ext ++LIBS=$ac_check_lib_save_LIBS ++fi ++echo "$as_me:$LINENO: result: $ac_cv_lib_rt_clock_gettime" >&5 ++echo "${ECHO_T}$ac_cv_lib_rt_clock_gettime" >&6 ++if test $ac_cv_lib_rt_clock_gettime = yes; then ++ cat >>confdefs.h <<_ACEOF ++#define HAVE_LIBRT 1 ++_ACEOF ++ ++ LIBS="-lrt $LIBS" ++ ++fi ++ ++ LIBRT=$LIBS ++ LIBS="$my_save_LIBS" ++ ++ ++ LIBS="$LIBS $LIBRT" ++ + for ac_func in clock_gettime + do + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +@@ -38791,7 +38875,7 @@ + + fi + +-CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS" ++CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS $LIBRT" + + + +diff -r 592f6c3641ba configure.in +--- a/configure.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/configure.in Wed Jul 29 13:34:11 2009 -0700 +@@ -2136,7 +2136,18 @@ + # We also disable for SCO for the time being, the headers for the + # thread library we use conflicts with other headers. 
+ ;; +- *) AC_CHECK_FUNCS(clock_gettime) ++*) ++ # most systems require the program be linked with librt library to use ++ # the function clock_gettime ++ my_save_LIBS="$LIBS" ++ LIBS="" ++ AC_CHECK_LIB(rt,clock_gettime) ++ LIBRT=$LIBS ++ LIBS="$my_save_LIBS" ++ AC_SUBST(LIBRT) ++ ++ LIBS="$LIBS $LIBRT" ++ AC_CHECK_FUNCS(clock_gettime) + ;; + esac + +@@ -2772,7 +2783,7 @@ + AC_DEFINE([THREAD_SAFE_CLIENT], [1], [Should be client be thread safe]) + fi + +-CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS" ++CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS $LIBRT" + + AC_SUBST(CLIENT_LIBS) + AC_SUBST(NON_THREADED_LIBS) +diff -r 592f6c3641ba dbug/Makefile.in +--- a/dbug/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/dbug/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -192,6 +192,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/Makefile.in +--- a/extra/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -240,6 +240,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/Makefile.in +--- a/extra/yassl/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -142,6 +142,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/src/Makefile.in +--- a/extra/yassl/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -151,6 +151,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = 
@LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/taocrypt/Makefile.in +--- a/extra/yassl/taocrypt/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/taocrypt/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -142,6 +142,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/taocrypt/benchmark/Makefile.in +--- a/extra/yassl/taocrypt/benchmark/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/taocrypt/benchmark/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -153,6 +153,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/taocrypt/src/Makefile.in +--- a/extra/yassl/taocrypt/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/taocrypt/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -164,6 +164,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/taocrypt/test/Makefile.in +--- a/extra/yassl/taocrypt/test/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/taocrypt/test/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -153,6 +153,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba extra/yassl/testsuite/Makefile.in +--- a/extra/yassl/testsuite/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/extra/yassl/testsuite/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 
@@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba heap/Makefile.in +--- a/heap/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/heap/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -202,6 +202,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba include/Makefile.in +--- a/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -160,6 +160,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba include/mysql_com.h +--- a/include/mysql_com.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/include/mysql_com.h Wed Jul 29 13:34:11 2009 -0700 +@@ -25,6 +25,7 @@ + #define USERNAME_LENGTH 16 + #define SERVER_VERSION_LENGTH 60 + #define SQLSTATE_LENGTH 5 ++#define LIST_PROCESS_HOST_LEN 64 + + /* + USER_HOST_BUFF_SIZE -- length of string buffer, that is enough to contain +@@ -106,6 +107,11 @@ + thread */ + #define REFRESH_MASTER 128 /* Remove all bin logs in the index + and truncate the index */ ++#define REFRESH_TABLE_STATS 256 /* Refresh table stats hash table */ ++#define REFRESH_INDEX_STATS 512 /* Refresh index stats hash table */ ++#define REFRESH_USER_STATS 1024 /* Refresh user stats hash table */ ++#define REFRESH_SLOW_QUERY_LOG 4096 /* Flush slow query log and rotate*/ ++#define REFRESH_CLIENT_STATS 8192 /* Refresh client stats hash table */ + + /* The following can't be set with mysql_refresh() */ + #define REFRESH_READ_LOCK 16384 /* Lock tables for read */ +diff -r 592f6c3641ba libmysql/Makefile.in +--- a/libmysql/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ 
b/libmysql/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -224,6 +224,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @CLIENT_LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba libmysql_r/Makefile.in +--- a/libmysql_r/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/libmysql_r/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -221,6 +221,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ @ZLIB_LIBS@ @openssl_libs@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba libmysqld/Makefile.in +--- a/libmysqld/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/libmysqld/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -246,6 +246,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba libmysqld/examples/Makefile.in +--- a/libmysqld/examples/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/libmysqld/examples/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -192,6 +192,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ @WRAPLIBS@ @CLIENT_LIBS@ $(yassl_libs) + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba man/Makefile.in +--- a/man/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/man/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -151,6 +151,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba myisam/Makefile.in +--- a/myisam/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/myisam/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -235,6 +235,7 @@ + LIBDL = @LIBDL@ + 
LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba myisammrg/Makefile.in +--- a/myisammrg/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/myisammrg/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -183,6 +183,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba mysql-test/Makefile.in +--- a/mysql-test/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/mysql-test/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -161,6 +161,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba mysql-test/ndb/Makefile.in +--- a/mysql-test/ndb/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/mysql-test/ndb/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -147,6 +147,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba mysql-test/r/information_schema.result +--- a/mysql-test/r/information_schema.result Wed Jul 29 13:33:34 2009 -0700 ++++ b/mysql-test/r/information_schema.result Wed Jul 29 13:34:11 2009 -0700 +@@ -37,10 +37,12 @@ + select * from v1; + c + CHARACTER_SETS ++CLIENT_STATISTICS + COLLATIONS + COLLATION_CHARACTER_SET_APPLICABILITY + COLUMNS + COLUMN_PRIVILEGES ++INDEX_STATISTICS + KEY_COLUMN_USAGE + PROFILING + ROUTINES +@@ -50,8 +52,10 @@ + TABLES + TABLE_CONSTRAINTS + TABLE_PRIVILEGES ++TABLE_STATISTICS + TRIGGERS + USER_PRIVILEGES ++USER_STATISTICS + VIEWS + columns_priv + db +@@ -83,6 +87,7 @@ + TABLES TABLES + TABLE_CONSTRAINTS TABLE_CONSTRAINTS + TABLE_PRIVILEGES TABLE_PRIVILEGES ++TABLE_STATISTICS 
TABLE_STATISTICS + TRIGGERS TRIGGERS + tables_priv tables_priv + time_zone time_zone +@@ -102,6 +107,7 @@ + TABLES TABLES + TABLE_CONSTRAINTS TABLE_CONSTRAINTS + TABLE_PRIVILEGES TABLE_PRIVILEGES ++TABLE_STATISTICS TABLE_STATISTICS + TRIGGERS TRIGGERS + tables_priv tables_priv + time_zone time_zone +@@ -121,6 +127,7 @@ + TABLES TABLES + TABLE_CONSTRAINTS TABLE_CONSTRAINTS + TABLE_PRIVILEGES TABLE_PRIVILEGES ++TABLE_STATISTICS TABLE_STATISTICS + TRIGGERS TRIGGERS + tables_priv tables_priv + time_zone time_zone +@@ -594,12 +601,13 @@ + where table_schema='information_schema' limit 2; + TABLE_NAME TABLE_TYPE ENGINE + CHARACTER_SETS SYSTEM VIEW MEMORY +-COLLATIONS SYSTEM VIEW MEMORY ++CLIENT_STATISTICS SYSTEM VIEW MEMORY + show tables from information_schema like "T%"; + Tables_in_information_schema (T%) + TABLES + TABLE_CONSTRAINTS + TABLE_PRIVILEGES ++TABLE_STATISTICS + TRIGGERS + create database information_schema; + ERROR 42000: Access denied for user 'root'@'localhost' to database 'information_schema' +@@ -609,6 +617,7 @@ + TABLES SYSTEM VIEW + TABLE_CONSTRAINTS SYSTEM VIEW + TABLE_PRIVILEGES SYSTEM VIEW ++TABLE_STATISTICS SYSTEM VIEW + TRIGGERS SYSTEM VIEW + create table t1(a int); + ERROR 42S02: Unknown table 't1' in information_schema +@@ -621,6 +630,7 @@ + TABLES + TABLE_CONSTRAINTS + TABLE_PRIVILEGES ++TABLE_STATISTICS + TRIGGERS + select table_name from tables where table_name='user'; + table_name +@@ -730,7 +740,7 @@ + CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1; + CREATE VIEW a2 AS SELECT t_CRASHME FROM a1; + count(*) +-102 ++106 + drop view a2, a1; + drop table t_crashme; + select table_schema,table_name, column_name from +@@ -790,18 +800,20 @@ + TABLE_NAME COLUMN_NAME PRIVILEGES + COLUMNS TABLE_NAME select + COLUMN_PRIVILEGES TABLE_NAME select ++INDEX_STATISTICS TABLE_NAME select + KEY_COLUMN_USAGE TABLE_NAME select + STATISTICS TABLE_NAME select + TABLES TABLE_NAME select + TABLE_CONSTRAINTS TABLE_NAME select + TABLE_PRIVILEGES 
TABLE_NAME select ++TABLE_STATISTICS TABLE_NAME select + VIEWS TABLE_NAME select + delete from mysql.user where user='mysqltest_4'; + delete from mysql.db where user='mysqltest_4'; + flush privileges; + SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA; + table_schema count(*) +-information_schema 17 ++information_schema 21 + mysql 17 + create table t1 (i int, j int); + create trigger trg1 before insert on t1 for each row +@@ -1187,10 +1199,12 @@ + ); + table_name column_name + CHARACTER_SETS CHARACTER_SET_NAME ++CLIENT_STATISTICS CLIENT + COLLATIONS COLLATION_NAME + COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME + COLUMNS TABLE_SCHEMA + COLUMN_PRIVILEGES TABLE_SCHEMA ++INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA + PROFILING QUERY_ID + ROUTINES ROUTINE_SCHEMA +@@ -1200,8 +1214,10 @@ + TABLES TABLE_SCHEMA + TABLE_CONSTRAINTS CONSTRAINT_SCHEMA + TABLE_PRIVILEGES TABLE_SCHEMA ++TABLE_STATISTICS TABLE_SCHEMA + TRIGGERS TRIGGER_SCHEMA + USER_PRIVILEGES GRANTEE ++USER_STATISTICS USER + VIEWS TABLE_SCHEMA + SELECT t.table_name, c1.column_name + FROM information_schema.tables t +@@ -1219,10 +1235,12 @@ + ); + table_name column_name + CHARACTER_SETS CHARACTER_SET_NAME ++CLIENT_STATISTICS CLIENT + COLLATIONS COLLATION_NAME + COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME + COLUMNS TABLE_SCHEMA + COLUMN_PRIVILEGES TABLE_SCHEMA ++INDEX_STATISTICS TABLE_SCHEMA + KEY_COLUMN_USAGE CONSTRAINT_SCHEMA + PROFILING QUERY_ID + ROUTINES ROUTINE_SCHEMA +@@ -1232,8 +1250,10 @@ + TABLES TABLE_SCHEMA + TABLE_CONSTRAINTS CONSTRAINT_SCHEMA + TABLE_PRIVILEGES TABLE_SCHEMA ++TABLE_STATISTICS TABLE_SCHEMA + TRIGGERS TRIGGER_SCHEMA + USER_PRIVILEGES GRANTEE ++USER_STATISTICS USER + VIEWS TABLE_SCHEMA + SELECT MAX(table_name) FROM information_schema.tables; + MAX(table_name) +@@ -1302,10 +1322,12 @@ + group by t.table_name order by num1, t.table_name; + table_name group_concat(t.table_schema, '.', t.table_name) num1 + 
CHARACTER_SETS information_schema.CHARACTER_SETS 1 ++CLIENT_STATISTICS information_schema.CLIENT_STATISTICS 1 + COLLATIONS information_schema.COLLATIONS 1 + COLLATION_CHARACTER_SET_APPLICABILITY information_schema.COLLATION_CHARACTER_SET_APPLICABILITY 1 + COLUMNS information_schema.COLUMNS 1 + COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1 ++INDEX_STATISTICS information_schema.INDEX_STATISTICS 1 + KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 + PROFILING information_schema.PROFILING 1 + ROUTINES information_schema.ROUTINES 1 +@@ -1315,8 +1337,10 @@ + TABLES information_schema.TABLES 1 + TABLE_CONSTRAINTS information_schema.TABLE_CONSTRAINTS 1 + TABLE_PRIVILEGES information_schema.TABLE_PRIVILEGES 1 ++TABLE_STATISTICS information_schema.TABLE_STATISTICS 1 + TRIGGERS information_schema.TRIGGERS 1 + USER_PRIVILEGES information_schema.USER_PRIVILEGES 1 ++USER_STATISTICS information_schema.USER_STATISTICS 1 + VIEWS information_schema.VIEWS 1 + create table t1(f1 int); + create view v1 as select f1+1 as a from t1; +diff -r 592f6c3641ba mysql-test/r/information_schema_db.result +--- a/mysql-test/r/information_schema_db.result Wed Jul 29 13:33:34 2009 -0700 ++++ b/mysql-test/r/information_schema_db.result Wed Jul 29 13:34:11 2009 -0700 +@@ -6,10 +6,12 @@ + show tables; + Tables_in_information_schema + CHARACTER_SETS ++CLIENT_STATISTICS + COLLATIONS + COLLATION_CHARACTER_SET_APPLICABILITY + COLUMNS + COLUMN_PRIVILEGES ++INDEX_STATISTICS + KEY_COLUMN_USAGE + PROFILING + ROUTINES +@@ -19,14 +21,17 @@ + TABLES + TABLE_CONSTRAINTS + TABLE_PRIVILEGES ++TABLE_STATISTICS + TRIGGERS + USER_PRIVILEGES ++USER_STATISTICS + VIEWS + show tables from INFORMATION_SCHEMA like 'T%'; + Tables_in_information_schema (T%) + TABLES + TABLE_CONSTRAINTS + TABLE_PRIVILEGES ++TABLE_STATISTICS + TRIGGERS + create database `inf%`; + create database mbase; +diff -r 592f6c3641ba mysql-test/r/mysqlshow.result +--- a/mysql-test/r/mysqlshow.result Wed Jul 29 13:33:34 2009 -0700 ++++ 
b/mysql-test/r/mysqlshow.result Wed Jul 29 13:34:11 2009 -0700 +@@ -80,10 +80,12 @@ + | Tables | + +---------------------------------------+ + | CHARACTER_SETS | ++| CLIENT_STATISTICS | + | COLLATIONS | + | COLLATION_CHARACTER_SET_APPLICABILITY | + | COLUMNS | + | COLUMN_PRIVILEGES | ++| INDEX_STATISTICS | + | KEY_COLUMN_USAGE | + | PROFILING | + | ROUTINES | +@@ -93,8 +95,10 @@ + | TABLES | + | TABLE_CONSTRAINTS | + | TABLE_PRIVILEGES | ++| TABLE_STATISTICS | + | TRIGGERS | + | USER_PRIVILEGES | ++| USER_STATISTICS | + | VIEWS | + +---------------------------------------+ + Database: INFORMATION_SCHEMA +@@ -102,10 +106,12 @@ + | Tables | + +---------------------------------------+ + | CHARACTER_SETS | ++| CLIENT_STATISTICS | + | COLLATIONS | + | COLLATION_CHARACTER_SET_APPLICABILITY | + | COLUMNS | + | COLUMN_PRIVILEGES | ++| INDEX_STATISTICS | + | KEY_COLUMN_USAGE | + | PROFILING | + | ROUTINES | +@@ -115,8 +121,10 @@ + | TABLES | + | TABLE_CONSTRAINTS | + | TABLE_PRIVILEGES | ++| TABLE_STATISTICS | + | TRIGGERS | + | USER_PRIVILEGES | ++| USER_STATISTICS | + | VIEWS | + +---------------------------------------+ + Wildcard: inf_rmation_schema +diff -r 592f6c3641ba mysys/Makefile.in +--- a/mysys/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/mysys/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -228,6 +228,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/Makefile.in +--- a/ndb/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -171,6 +171,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/docs/Makefile.in +--- a/ndb/docs/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/docs/Makefile.in Wed Jul 29 
13:34:11 2009 -0700 +@@ -149,6 +149,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/include/Makefile.in +--- a/ndb/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -179,6 +179,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/Makefile.in +--- a/ndb/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -204,6 +204,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/Makefile.in +--- a/ndb/src/common/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -174,6 +174,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/debugger/Makefile.in +--- a/ndb/src/common/debugger/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/debugger/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -206,6 +206,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/debugger/signaldata/Makefile.in +--- a/ndb/src/common/debugger/signaldata/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/debugger/signaldata/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -211,6 +211,7 @@ + LIBDL = 
@LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/logger/Makefile.in +--- a/ndb/src/common/logger/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/logger/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -197,6 +197,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/mgmcommon/Makefile.in +--- a/ndb/src/common/mgmcommon/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/mgmcommon/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -211,6 +211,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/portlib/Makefile.in +--- a/ndb/src/common/portlib/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/portlib/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -222,6 +222,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/transporter/Makefile.in +--- a/ndb/src/common/transporter/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/transporter/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -197,6 +197,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/common/util/Makefile.in +--- a/ndb/src/common/util/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/common/util/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -217,6 
+217,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/cw/Makefile.in +--- a/ndb/src/cw/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/cw/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/cw/cpcd/Makefile.in +--- a/ndb/src/cw/cpcd/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/cw/cpcd/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -207,6 +207,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/Makefile.in +--- a/ndb/src/kernel/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -227,6 +227,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/Makefile.in +--- a/ndb/src/kernel/blocks/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/backup/Makefile.in +--- a/ndb/src/kernel/blocks/backup/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/backup/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = 
@LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/cmvmi/Makefile.in +--- a/ndb/src/kernel/blocks/cmvmi/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/cmvmi/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbacc/Makefile.in +--- a/ndb/src/kernel/blocks/dbacc/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbacc/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbdict/Makefile.in +--- a/ndb/src/kernel/blocks/dbdict/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbdict/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -206,6 +206,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbdih/Makefile.in +--- a/ndb/src/kernel/blocks/dbdih/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbdih/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -203,6 +203,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dblqh/Makefile.in +--- a/ndb/src/kernel/blocks/dblqh/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dblqh/Makefile.in 
Wed Jul 29 13:34:11 2009 -0700 +@@ -204,6 +204,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtc/Makefile.in +--- a/ndb/src/kernel/blocks/dbtc/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbtc/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtup/Makefile.in +--- a/ndb/src/kernel/blocks/dbtup/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbtup/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -204,6 +204,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtux/Makefile.in +--- a/ndb/src/kernel/blocks/dbtux/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbtux/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -199,6 +199,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/dbutil/Makefile.in +--- a/ndb/src/kernel/blocks/dbutil/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/dbutil/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/ndbcntr/Makefile.in +--- 
a/ndb/src/kernel/blocks/ndbcntr/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/ndbcntr/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -197,6 +197,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/ndbfs/Makefile.in +--- a/ndb/src/kernel/blocks/ndbfs/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/ndbfs/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -197,6 +197,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/qmgr/Makefile.in +--- a/ndb/src/kernel/blocks/qmgr/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/qmgr/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/suma/Makefile.in +--- a/ndb/src/kernel/blocks/suma/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/suma/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/blocks/trix/Makefile.in +--- a/ndb/src/kernel/blocks/trix/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/blocks/trix/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = 
@LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/error/Makefile.in +--- a/ndb/src/kernel/error/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/error/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -206,6 +206,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/kernel/vm/Makefile.in +--- a/ndb/src/kernel/vm/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/kernel/vm/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -207,6 +207,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/mgmapi/Makefile.in +--- a/ndb/src/mgmapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/mgmapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -205,6 +205,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/mgmclient/Makefile.in +--- a/ndb/src/mgmclient/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/mgmclient/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -216,6 +216,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/mgmsrv/Makefile.in +--- a/ndb/src/mgmsrv/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/mgmsrv/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -213,6 +213,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/src/ndbapi/Makefile.in +--- 
a/ndb/src/ndbapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/src/ndbapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -215,6 +215,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/Makefile.in +--- a/ndb/test/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/ndbapi/Makefile.in +--- a/ndb/test/ndbapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/ndbapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -595,6 +595,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/ndbapi/bank/Makefile.in +--- a/ndb/test/ndbapi/bank/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/ndbapi/bank/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -282,6 +282,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/run-test/Makefile.in +--- a/ndb/test/run-test/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/run-test/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -243,6 +243,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/src/Makefile.in +--- a/ndb/test/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/src/Makefile.in Wed Jul 
29 13:34:11 2009 -0700 +@@ -213,6 +213,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/test/tools/Makefile.in +--- a/ndb/test/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/test/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -325,6 +325,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba ndb/tools/Makefile.in +--- a/ndb/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/ndb/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -344,6 +344,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba netware/Makefile.in +--- a/netware/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/netware/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -199,6 +199,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba os2/Makefile.in +--- a/os2/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/os2/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba os2/include/Makefile.in +--- a/os2/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/os2/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -156,6 +156,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + 
LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba os2/include/sys/Makefile.in +--- a/os2/include/sys/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/os2/include/sys/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -144,6 +144,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba patch_info/userstats.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/userstats.info Wed Jul 29 13:34:11 2009 -0700 +@@ -0,0 +1,14 @@ ++File=userstatsv2.patch ++Name=SHOW USER/TABLE/INDEX statistics ++Version=V2 ++Author=Google ++License=GPL ++Comment=Added INFORMATION_SCHEMA.*_STATISTICS ++2008-12-01 ++YK: fix behavior for prepared statements ++ ++2008-11-26 ++YK: add switch variable "userstat_running" to control INFORMATION_SCHEMA.*_STATISTICS (default:OFF) ++ ++2008-12-09 ++YK: fixed "Row_sent: 0" problem at microslow_innodb.patch +diff -r 592f6c3641ba pstack/Makefile.in +--- a/pstack/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/pstack/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -196,6 +196,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba pstack/aout/Makefile.in +--- a/pstack/aout/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/pstack/aout/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -134,6 +134,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba regex/Makefile.in +--- a/regex/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/regex/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -180,6 +180,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + 
LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba scripts/Makefile.in +--- a/scripts/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/scripts/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -176,6 +176,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba server-tools/Makefile.in +--- a/server-tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/server-tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -155,6 +155,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba server-tools/instance-manager/Makefile.in +--- a/server-tools/instance-manager/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/server-tools/instance-manager/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -205,6 +205,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba sql/Makefile.in +--- a/sql/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -274,6 +274,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/ha_innodb.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -3341,6 +3341,8 @@ + + error = row_insert_for_mysql((byte*) record, prebuilt); + ++ if (error == DB_SUCCESS) rows_changed++; ++ + if (error == DB_SUCCESS && auto_inc_used) { + + /* Fetch the value that was set in the autoincrement field */ +@@ -3613,6 +3615,8 @@ + } + } 
+ ++ if (error == DB_SUCCESS) rows_changed++; ++ + innodb_srv_conc_exit_innodb(prebuilt->trx); + + error = convert_error_code_to_mysql(error, user_thd); +@@ -3661,6 +3665,8 @@ + + error = row_update_for_mysql((byte*) record, prebuilt); + ++ if (error == DB_SUCCESS) rows_changed++; ++ + innodb_srv_conc_exit_innodb(prebuilt->trx); + + error = convert_error_code_to_mysql(error, user_thd); +@@ -4092,6 +4098,9 @@ + if (ret == DB_SUCCESS) { + error = 0; + table->status = 0; ++ rows_read++; ++ if (active_index >= 0 && active_index < MAX_KEY) ++ index_rows_read[active_index]++; + + } else if (ret == DB_RECORD_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; +diff -r 592f6c3641ba sql/ha_myisam.cc +--- a/sql/ha_myisam.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/ha_myisam.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -670,7 +670,9 @@ + if ((error= update_auto_increment())) + return error; + } +- return mi_write(file,buf); ++ int error=mi_write(file,buf); ++ if (!error) rows_changed++; ++ return error; + } + + int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt) +@@ -1521,13 +1523,17 @@ + statistic_increment(table->in_use->status_var.ha_update_count,&LOCK_status); + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) + table->timestamp_field->set_time(); +- return mi_update(file,old_data,new_data); ++ int error=mi_update(file,old_data,new_data); ++ if (!error) rows_changed++; ++ return error; + } + + int ha_myisam::delete_row(const byte * buf) + { + statistic_increment(table->in_use->status_var.ha_delete_count,&LOCK_status); +- return mi_delete(file,buf); ++ int error=mi_delete(file,buf); ++ if (!error) rows_changed++; ++ return error; + } + + int ha_myisam::index_read(byte * buf, const byte * key, +@@ -1538,6 +1544,13 @@ + &LOCK_status); + int error=mi_rkey(file,buf,active_index, key, key_len, find_flag); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? 
file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1548,6 +1561,13 @@ + &LOCK_status); + int error=mi_rkey(file,buf,index, key, key_len, find_flag); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1558,6 +1578,13 @@ + &LOCK_status); + int error=mi_rkey(file,buf,active_index, key, key_len, HA_READ_PREFIX_LAST); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1568,6 +1595,13 @@ + &LOCK_status); + int error=mi_rnext(file,buf,active_index); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1578,6 +1612,13 @@ + &LOCK_status); + int error=mi_rprev(file,buf, active_index); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1588,6 +1629,13 @@ + &LOCK_status); + int error=mi_rfirst(file, buf, active_index); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1598,6 +1646,13 @@ + &LOCK_status); + int error=mi_rlast(file, buf, active_index); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? 
file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1614,6 +1669,13 @@ + error= mi_rnext_same(file,buf); + } while (error == HA_ERR_RECORD_DELETED); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) { ++ rows_read++; ++ ++ int inx = (active_index == -1) ? file->lastinx : active_index; ++ if (inx >= 0 && inx < MAX_KEY) ++ index_rows_read[inx]++; ++ } + return error; + } + +@@ -1631,6 +1693,7 @@ + &LOCK_status); + int error=mi_scan(file, buf); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) rows_read++; + return error; + } + +@@ -1645,6 +1708,7 @@ + &LOCK_status); + int error=mi_rrnd(file, buf, my_get_ptr(pos,ref_length)); + table->status=error ? STATUS_NOT_FOUND: 0; ++ if (!error) rows_read++; + return error; + } + +diff -r 592f6c3641ba sql/handler.cc +--- a/sql/handler.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/handler.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -726,6 +726,8 @@ + if (cookie) + tc_log->unlog(cookie, xid); + DBUG_EXECUTE_IF("crash_commit_after", abort();); ++ if (is_real_trans) ++ thd->diff_commit_trans++; + end: + if (is_real_trans) + start_waiting_global_read_lock(thd); +@@ -783,6 +785,7 @@ + thd->transaction.cleanup(); + } + } ++ thd->diff_rollback_trans++; + #endif /* USING_TRANSACTIONS */ + DBUG_RETURN(error); + } +@@ -1223,6 +1226,7 @@ + statistic_increment(thd->status_var.ha_rollback_count,&LOCK_status); + *ht=0; // keep it conveniently zero-filled + } ++ thd->diff_rollback_trans++; + DBUG_RETURN(error); + } + +@@ -1453,6 +1457,8 @@ + else + dupp_ref=ref+ALIGN_SIZE(ref_length); + } ++ rows_read = rows_changed = 0; ++ memset(index_rows_read, 0, sizeof(index_rows_read)); + DBUG_RETURN(error); + } + +@@ -2287,6 +2293,111 @@ + return error; + } + ++// Updates the global table stats with the TABLE this handler represents. 
++void handler::update_global_table_stats() { ++ if (!opt_userstat_running) { ++ rows_read = rows_changed = 0; ++ return; ++ } ++ ++ if (!rows_read && !rows_changed) return; // Nothing to update. ++ // table_cache_key is db_name + '\0' + table_name + '\0'. ++ if (!table->s || !table->s->table_cache_key || !table->s->table_name) return; ++ ++ TABLE_STATS* table_stats; ++ char key[NAME_LEN * 2 + 2]; ++ // [db] + '.' + [table] ++ sprintf(key, "%s.%s", table->s->table_cache_key, table->s->table_name); ++ ++ pthread_mutex_lock(&LOCK_global_table_stats); ++ // Gets the global table stats, creating one if necessary. ++ if (!(table_stats = (TABLE_STATS*)hash_search(&global_table_stats, ++ (byte*)key, ++ strlen(key)))) { ++ if (!(table_stats = ((TABLE_STATS*) ++ my_malloc(sizeof(TABLE_STATS), MYF(MY_WME | MY_ZEROFILL))))) { ++ // Out of memory. ++ sql_print_error("Allocating table stats failed."); ++ goto end; ++ } ++ strncpy(table_stats->table, key, sizeof(table_stats->table)); ++ table_stats->rows_read = 0; ++ table_stats->rows_changed = 0; ++ table_stats->rows_changed_x_indexes = 0; ++ table_stats->engine_type = (int) ht->db_type; ++ ++ if (my_hash_insert(&global_table_stats, (byte*)table_stats)) { ++ // Out of memory. ++ sql_print_error("Inserting table stats failed."); ++ my_free((char*)table_stats, 0); ++ goto end; ++ } ++ } ++ // Updates the global table stats. ++ table_stats->rows_read += rows_read; ++ table_stats->rows_changed += rows_changed; ++ table_stats->rows_changed_x_indexes += ++ rows_changed * (table->s->keys ? table->s->keys : 1); ++ current_thd->diff_total_read_rows += rows_read; ++ rows_read = rows_changed = 0; ++end: ++ pthread_mutex_unlock(&LOCK_global_table_stats); ++} ++ ++// Updates the global index stats with this handler's accumulated index reads. ++void handler::update_global_index_stats() { ++ // table_cache_key is db_name + '\0' + table_name + '\0'. 
++ if (!table->s || !table->s->table_cache_key || !table->s->table_name) return; ++ ++ if (!opt_userstat_running) { ++ for (int x = 0; x < table->s->keys; x++) { ++ index_rows_read[x] = 0; ++ } ++ return; ++ } ++ ++ for (int x = 0; x < table->s->keys; x++) { ++ if (index_rows_read[x]) { ++ // Rows were read using this index. ++ KEY* key_info = &table->key_info[x]; ++ ++ if (!key_info->name) continue; ++ ++ INDEX_STATS* index_stats; ++ char key[NAME_LEN * 3 + 3]; ++ // [db] + '.' + [table] + '.' + [index] ++ sprintf(key, "%s.%s.%s", table->s->table_cache_key, ++ table->s->table_name, key_info->name); ++ ++ pthread_mutex_lock(&LOCK_global_index_stats); ++ // Gets the global index stats, creating one if necessary. ++ if (!(index_stats = (INDEX_STATS*)hash_search(&global_index_stats, ++ (byte*)key, ++ strlen(key)))) { ++ if (!(index_stats = ((INDEX_STATS*) ++ my_malloc(sizeof(INDEX_STATS), MYF(MY_WME | MY_ZEROFILL))))) { ++ // Out of memory. ++ sql_print_error("Allocating index stats failed."); ++ goto end; ++ } ++ strncpy(index_stats->index, key, sizeof(index_stats->index)); ++ index_stats->rows_read = 0; ++ ++ if (my_hash_insert(&global_index_stats, (byte*)index_stats)) { ++ // Out of memory. ++ sql_print_error("Inserting index stats failed."); ++ my_free((char*)index_stats, 0); ++ goto end; ++ } ++ } ++ // Updates the global index stats. ++ index_stats->rows_read += index_rows_read[x]; ++ index_rows_read[x] = 0; ++end: ++ pthread_mutex_unlock(&LOCK_global_index_stats); ++ } ++ } ++} + + /**************************************************************************** + ** Some general functions that isn't in the handler class +diff -r 592f6c3641ba sql/handler.h +--- a/sql/handler.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/handler.h Wed Jul 29 13:34:11 2009 -0700 +@@ -32,6 +32,10 @@ + #define USING_TRANSACTIONS + #endif + ++#if MAX_KEY > 128 ++#error MAX_KEY is too large. Values up to 128 are supported. 
++#endif ++ + // the following is for checking tables + + #define HA_ADMIN_ALREADY_DONE 1 +@@ -604,6 +608,9 @@ + bool auto_increment_column_changed; + bool implicit_emptied; /* Can be !=0 only if HEAP */ + const COND *pushed_cond; ++ ulonglong rows_read; ++ ulonglong rows_changed; ++ ulonglong index_rows_read[MAX_KEY]; + + handler(const handlerton *ht_arg, TABLE *table_arg) :table(table_arg), + ht(ht_arg), +@@ -615,8 +622,10 @@ + ref_length(sizeof(my_off_t)), block_size(0), + raid_type(0), ft_handler(0), inited(NONE), + locked(FALSE), implicit_emptied(0), +- pushed_cond(NULL) +- {} ++ pushed_cond(NULL), rows_read(0), rows_changed(0) ++ { ++ memset(index_rows_read, 0, sizeof(index_rows_read)); ++ } + virtual ~handler(void) { DBUG_ASSERT(locked == FALSE); /* TODO: DBUG_ASSERT(inited == NONE); */ } + virtual handler *clone(MEM_ROOT *mem_root); + int ha_open(const char *name, int mode, int test_if_locked); +@@ -625,7 +634,11 @@ + virtual void print_error(int error, myf errflag); + virtual bool get_error_message(int error, String *buf); + uint get_dup_key(int error); +- void change_table_ptr(TABLE *table_arg) { table=table_arg; } ++ void change_table_ptr(TABLE *table_arg) { ++ table=table_arg; ++ rows_read = rows_changed = 0; ++ memset(index_rows_read, 0, sizeof(index_rows_read)); ++ } + virtual double scan_time() + { return ulonglong2double(data_file_length) / IO_SIZE + 2; } + virtual double read_time(uint index, uint ranges, ha_rows rows) +@@ -886,6 +899,9 @@ + virtual bool is_crashed() const { return 0; } + virtual bool auto_repair() const { return 0; } + ++ void update_global_table_stats(); ++ void update_global_index_stats(); ++ + /* + default rename_table() and delete_table() rename/delete files with a + given name and extensions from bas_ext() +diff -r 592f6c3641ba sql/lex.h +--- a/sql/lex.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/lex.h Wed Jul 29 13:34:11 2009 -0700 +@@ -109,6 +109,7 @@ + { "CHECKSUM", SYM(CHECKSUM_SYM)}, + { "CIPHER", SYM(CIPHER_SYM)}, + { 
"CLIENT", SYM(CLIENT_SYM)}, ++ { "CLIENT_STATISTICS", SYM(CLIENT_STATS_SYM)}, + { "CLOSE", SYM(CLOSE_SYM)}, + { "CODE", SYM(CODE_SYM)}, + { "COLLATE", SYM(COLLATE_SYM)}, +@@ -238,6 +239,7 @@ + { "IN", SYM(IN_SYM)}, + { "INDEX", SYM(INDEX_SYM)}, + { "INDEXES", SYM(INDEXES)}, ++ { "INDEX_STATISTICS", SYM(INDEX_STATS_SYM)}, + { "INFILE", SYM(INFILE)}, + { "INNER", SYM(INNER_SYM)}, + { "INNOBASE", SYM(INNOBASE_SYM)}, +@@ -443,6 +445,7 @@ + { "SIGNED", SYM(SIGNED_SYM)}, + { "SIMPLE", SYM(SIMPLE_SYM)}, + { "SLAVE", SYM(SLAVE)}, ++ { "SLOW", SYM(SLOW_SYM)}, + { "SNAPSHOT", SYM(SNAPSHOT_SYM)}, + { "SMALLINT", SYM(SMALLINT)}, + { "SOME", SYM(ANY_SYM)}, +@@ -488,6 +491,7 @@ + { "TABLE", SYM(TABLE_SYM)}, + { "TABLES", SYM(TABLES)}, + { "TABLESPACE", SYM(TABLESPACE)}, ++ { "TABLE_STATISTICS", SYM(TABLE_STATS_SYM)}, + { "TEMPORARY", SYM(TEMPORARY)}, + { "TEMPTABLE", SYM(TEMPTABLE_SYM)}, + { "TERMINATED", SYM(TERMINATED)}, +@@ -525,6 +529,7 @@ + { "USE", SYM(USE_SYM)}, + { "USER", SYM(USER)}, + { "USER_RESOURCES", SYM(RESOURCES)}, ++ { "USER_STATISTICS", SYM(USER_STATS_SYM)}, + { "USE_FRM", SYM(USE_FRM)}, + { "USING", SYM(USING)}, + { "UTC_DATE", SYM(UTC_DATE_SYM)}, +diff -r 592f6c3641ba sql/log.cc +--- a/sql/log.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/log.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -1958,18 +1958,24 @@ + thd->current_insert_id); + if (e.write(file)) + goto err; ++ if (file == &log_file) ++ thd->binlog_bytes_written += e.data_written; + } + if (thd->insert_id_used) + { + Intvar_log_event e(thd,(uchar) INSERT_ID_EVENT,thd->last_insert_id); + if (e.write(file)) + goto err; ++ if (file == &log_file) ++ thd->binlog_bytes_written += e.data_written; + } + if (thd->rand_used) + { + Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2); + if (e.write(file)) + goto err; ++ if (file == &log_file) ++ thd->binlog_bytes_written += e.data_written; + } + if (thd->user_var_events.elements) + { +@@ -1985,6 +1991,8 @@ + user_var_event->charset_number); + if 
(e.write(file)) + goto err; ++ if (file == &log_file) ++ thd->binlog_bytes_written += e.data_written; + } + } + } +@@ -1995,6 +2003,8 @@ + + if (event_info->write(file)) + goto err; ++ if (file == &log_file) ++ thd->binlog_bytes_written += event_info->data_written; + + if (file == &log_file) // we are writing to the real log (disk) + { +@@ -2117,6 +2127,7 @@ + */ + if (qinfo.write(&log_file)) + goto err; ++ thd->binlog_bytes_written += qinfo.data_written; + + /* Read from the file used to cache the queries .*/ + if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0)) +@@ -2163,6 +2174,7 @@ + /* write the first half of the split header */ + if (my_b_write(&log_file, header, carry)) + goto err; ++ thd->binlog_bytes_written += carry; + + /* + copy fixed second half of header to cache so the correct +@@ -2231,6 +2243,8 @@ + /* Write data to the binary log file */ + if (my_b_write(&log_file, cache->read_pos, length)) + goto err; ++ thd->binlog_bytes_written += length; ++ + cache->read_pos=cache->read_end; // Mark buffer used up + DBUG_EXECUTE_IF("half_binlogged_transaction", goto DBUG_skip_commit;); + } while ((length=my_b_fill(cache))); +@@ -2239,6 +2253,8 @@ + + if (commit_event->write(&log_file)) + goto err; ++ thd->binlog_bytes_written += commit_event->data_written; ++ + #ifndef DBUG_OFF + DBUG_skip_commit: + #endif +diff -r 592f6c3641ba sql/mysql_priv.h +--- a/sql/mysql_priv.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/mysql_priv.h Wed Jul 29 13:34:11 2009 -0700 +@@ -837,7 +837,15 @@ + bool multi_delete_set_locks_and_link_aux_tables(LEX *lex); + void init_max_user_conn(void); + void init_update_queries(void); ++void init_global_user_stats(void); ++void init_global_table_stats(void); ++void init_global_index_stats(void); ++void init_global_client_stats(void); + void free_max_user_conn(void); ++void free_global_user_stats(void); ++void free_global_table_stats(void); ++void free_global_index_stats(void); ++void free_global_client_stats(void); + pthread_handler_t 
handle_one_connection(void *arg); + pthread_handler_t handle_bootstrap(void *arg); + void end_thread(THD *thd,bool put_in_cache); +@@ -1416,6 +1424,7 @@ + extern ulong max_connections,max_connect_errors, connect_timeout; + extern ulong slave_net_timeout, slave_trans_retries; + extern uint max_user_connections; ++extern ulonglong denied_connections; + extern ulong what_to_log,flush_time; + extern ulong query_buff_size, thread_stack; + extern ulong max_prepared_stmt_count, prepared_stmt_count; +@@ -1446,6 +1455,7 @@ + extern my_bool opt_safe_show_db, opt_local_infile; + extern my_bool opt_slave_compressed_protocol, use_temp_pool; + extern my_bool opt_readonly, lower_case_file_system; ++extern my_bool opt_userstat_running; + extern my_bool opt_enable_named_pipe, opt_sync_frm, opt_allow_suspicious_udfs; + extern my_bool opt_secure_auth; + extern char* opt_secure_file_priv; +@@ -1493,6 +1503,14 @@ + extern struct system_variables max_system_variables; + extern struct system_status_var global_status_var; + extern struct rand_struct sql_rand; ++extern HASH global_user_stats; ++extern HASH global_client_stats; ++extern pthread_mutex_t LOCK_global_user_client_stats; ++extern HASH global_table_stats; ++extern pthread_mutex_t LOCK_global_table_stats; ++extern HASH global_index_stats; ++extern pthread_mutex_t LOCK_global_index_stats; ++extern pthread_mutex_t LOCK_stats; + + extern const char *opt_date_time_formats[]; + extern KNOWN_DATE_TIME_FORMAT known_date_time_formats[]; +diff -r 592f6c3641ba sql/mysqld.cc +--- a/sql/mysqld.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/mysqld.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -417,6 +417,7 @@ + uint opt_large_page_size= 0; + my_bool opt_old_style_user_limits= 0, trust_function_creators= 0; + char* opt_slow_logname= 0; ++my_bool opt_userstat_running= 0; + /* + True if there is at least one per-hour limit for some user, so we should + check them before each query (and possibly reset counters when hour is +@@ -453,6 +454,7 @@ + ulong 
binlog_cache_use= 0, binlog_cache_disk_use= 0; + ulong max_connections, max_connect_errors; + uint max_user_connections= 0; ++ulonglong denied_connections = 0; + /* + Limit of the total number of prepared statements in the server. + Is necessary to protect the server against out-of-memory attacks. +@@ -555,6 +557,10 @@ + LOCK_crypt, LOCK_bytes_sent, LOCK_bytes_received, + LOCK_global_system_variables, + LOCK_user_conn, LOCK_slave_list, LOCK_active_mi; ++pthread_mutex_t LOCK_stats; ++pthread_mutex_t LOCK_global_user_client_stats; ++pthread_mutex_t LOCK_global_table_stats; ++pthread_mutex_t LOCK_global_index_stats; + /* + The below lock protects access to two global server variables: + max_prepared_stmt_count and prepared_stmt_count. These variables +@@ -1196,6 +1202,10 @@ + x_free(opt_secure_file_priv); + bitmap_free(&temp_pool); + free_max_user_conn(); ++ free_global_user_stats(); ++ free_global_client_stats(); ++ free_global_table_stats(); ++ free_global_index_stats(); + #ifdef HAVE_REPLICATION + end_slave_list(); + free_list(&replicate_do_db); +@@ -1310,6 +1320,10 @@ + (void) pthread_cond_destroy(&COND_thread_cache); + (void) pthread_cond_destroy(&COND_flush_thread_cache); + (void) pthread_cond_destroy(&COND_manager); ++ (void) pthread_mutex_destroy(&LOCK_stats); ++ (void) pthread_mutex_destroy(&LOCK_global_user_client_stats); ++ (void) pthread_mutex_destroy(&LOCK_global_table_stats); ++ (void) pthread_mutex_destroy(&LOCK_global_index_stats); + } + + #endif /*EMBEDDED_LIBRARY*/ +@@ -3157,6 +3171,10 @@ + (void) pthread_mutex_init(&LOCK_rpl_status, MY_MUTEX_INIT_FAST); + (void) pthread_cond_init(&COND_rpl_status, NULL); + #endif ++ (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST); ++ (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST); ++ (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST); ++ (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST); + sp_cache_init(); + /* Parameter for threads 
created for connections */ + (void) pthread_attr_init(&connection_attrib); +@@ -3428,6 +3446,10 @@ + sql_print_error("Out of memory"); + unireg_abort(1); + } ++ ++ init_global_table_stats(); ++ init_global_index_stats(); ++ + if (ha_init()) + { + sql_print_error("Can't init databases"); +@@ -3510,6 +3532,8 @@ + + init_max_user_conn(); + init_update_queries(); ++ init_global_user_stats(); ++ init_global_client_stats(); + DBUG_RETURN(0); + } + +@@ -4236,6 +4260,7 @@ + { + DBUG_PRINT("error",("Too many connections")); + close_connection(thd, ER_CON_COUNT_ERROR, 1); ++ statistic_increment(denied_connections, &LOCK_status); + delete thd; + DBUG_VOID_RETURN; + } +@@ -5056,6 +5081,7 @@ + OPT_PROFILING_USE_GETRUSAGE, + OPT_SLOW_LOG, + OPT_SLOW_QUERY_LOG_FILE, ++ OPT_USERSTAT_RUNNING, + OPT_USE_GLOBAL_LONG_QUERY_TIME, + OPT_INNODB_ROLLBACK_ON_TIMEOUT, + OPT_SECURE_FILE_PRIV, +@@ -6523,6 +6549,10 @@ + (gptr*) &max_system_variables.net_wait_timeout, 0, GET_ULONG, + REQUIRED_ARG, NET_WAIT_TIMEOUT, 1, IF_WIN(INT_MAX32/1000, LONG_TIMEOUT), + 0, 1, 0}, ++ {"userstat_running", OPT_USERSTAT_RUNNING, ++ "Control USER_STATISTICS, CLIENT_STATISTICS, INDEX_STATISTICS and TABLE_STATISTICS running", ++ (gptr*) &opt_userstat_running, (gptr*) &opt_userstat_running, ++ 0, GET_BOOL, NO_ARG, 0, 0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} + }; + +diff -r 592f6c3641ba sql/set_var.cc +--- a/sql/set_var.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/set_var.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -325,6 +325,7 @@ + sys_var_thd_ulong sys_read_buff_size("read_buffer_size", + &SV::read_buff_size); + sys_var_bool_ptr sys_readonly("read_only", &opt_readonly); ++sys_var_bool_ptr sys_userstat_running("userstat_running", &opt_userstat_running); + sys_var_thd_ulong sys_read_rnd_buff_size("read_rnd_buffer_size", + &SV::read_rnd_buff_size); + sys_var_thd_ulong sys_div_precincrement("div_precision_increment", +@@ -837,6 +838,7 @@ + &sys_trans_alloc_block_size, + 
&sys_trans_prealloc_size, + &sys_tx_isolation, ++ &sys_userstat_running, + &sys_version, + #ifdef HAVE_BERKELEY_DB + &sys_version_bdb, +@@ -1190,6 +1192,7 @@ + {sys_tx_isolation.name, (char*) &sys_tx_isolation, SHOW_SYS}, + {sys_updatable_views_with_limit.name, + (char*) &sys_updatable_views_with_limit,SHOW_SYS}, ++ {sys_userstat_running.name, (char*) &sys_userstat_running, SHOW_SYS}, + {sys_use_global_long_query_time.name, (char*) &sys_use_global_long_query_time, SHOW_SYS}, + {sys_version.name, (char*) &sys_version, SHOW_SYS}, + #ifdef HAVE_BERKELEY_DB +diff -r 592f6c3641ba sql/share/Makefile.in +--- a/sql/share/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/share/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -144,6 +144,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba sql/sql_base.cc +--- a/sql/sql_base.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_base.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -624,6 +624,12 @@ + DBUG_ENTER("close_thread_table"); + DBUG_ASSERT(table->key_read == 0); + DBUG_ASSERT(!table->file || table->file->inited == handler::NONE); ++ ++ if(table->file) ++ { ++ table->file->update_global_table_stats(); ++ table->file->update_global_index_stats(); ++ } + + *table_ptr=table->next; + if (table->needs_reopen_or_name_lock() || +@@ -670,6 +676,9 @@ + { + DBUG_ENTER("close_temporary"); + char path[FN_REFLEN]; ++ ++ table->file->update_global_table_stats(); ++ table->file->update_global_index_stats(); + db_type table_type=table->s->db_type; + strmov(path,table->s->path); + free_io_cache(table); +diff -r 592f6c3641ba sql/sql_class.cc +--- a/sql/sql_class.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_class.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -239,6 +239,13 @@ + bzero(ha_data, sizeof(ha_data)); + mysys_var=0; + binlog_evt_union.do_union= FALSE; ++ busy_time = 0; ++ cpu_time = 0; ++ 
bytes_received = 0; ++ bytes_sent = 0; ++ binlog_bytes_written = 0; ++ updated_row_count = 0; ++ sent_row_count_2 = 0; + #ifndef DBUG_OFF + dbug_sentry=THD_SENTRY_MAGIC; + #endif +@@ -378,6 +385,88 @@ + total_warn_count= 0; + update_charset(); + bzero((char *) &status_var, sizeof(status_var)); ++ reset_stats(); ++} ++ ++// Resets stats in a THD. ++void THD::reset_stats(void) { ++ current_connect_time = time(NULL); ++ last_global_update_time = current_connect_time; ++ reset_diff_stats(); ++} ++ ++// Resets the 'diff' stats, which are used to update global stats. ++void THD::reset_diff_stats(void) { ++ diff_total_busy_time = 0; ++ diff_total_cpu_time = 0; ++ diff_total_bytes_received = 0; ++ diff_total_bytes_sent = 0; ++ diff_total_binlog_bytes_written = 0; ++ diff_total_sent_rows = 0; ++ diff_total_updated_rows = 0; ++ diff_total_read_rows = 0; ++ diff_select_commands = 0; ++ diff_update_commands = 0; ++ diff_other_commands = 0; ++ diff_commit_trans = 0; ++ diff_rollback_trans = 0; ++ diff_denied_connections = 0; ++ diff_lost_connections = 0; ++ diff_access_denied_errors = 0; ++ diff_empty_queries = 0; ++} ++ ++// Updates 'diff' stats of a THD. ++void THD::update_stats(bool ran_command) { ++ if (opt_userstat_running) { ++ diff_total_busy_time += busy_time; ++ diff_total_cpu_time += cpu_time; ++ diff_total_bytes_received += bytes_received; ++ diff_total_bytes_sent += bytes_sent; ++ diff_total_binlog_bytes_written += binlog_bytes_written; ++ diff_total_sent_rows += sent_row_count_2; ++ diff_total_updated_rows += updated_row_count; ++ // diff_total_read_rows is updated in handler.cc. ++ ++ if (ran_command) { ++ // The replication thread has the COM_CONNECT command. ++ if ((old_command == COM_QUERY || command == COM_CONNECT) && ++ (lex->sql_command >= 0 && lex->sql_command < SQLCOM_END)) { ++ // A SQL query. 
++ if (lex->sql_command == SQLCOM_SELECT) { ++ if (lex->orig_sql_command == SQLCOM_END) { ++ diff_select_commands++; ++ if (!sent_row_count_2) ++ diff_empty_queries++; ++ } else { ++ // 'SHOW ' commands become SQLCOM_SELECT. ++ diff_other_commands++; ++ // 'SHOW ' commands shouldn't inflate total sent row count. ++ diff_total_sent_rows -= sent_row_count_2; ++ } ++ } else if (is_update_query(lex->sql_command)) { ++ diff_update_commands++; ++ } else { ++ diff_other_commands++; ++ } ++ } ++ } ++ // diff_commit_trans is updated in handler.cc. ++ // diff_rollback_trans is updated in handler.cc. ++ // diff_denied_connections is updated in sql_parse.cc. ++ // diff_lost_connections is updated in sql_parse.cc. ++ // diff_access_denied_errors is updated in sql_parse.cc. ++ ++ /* reset counters to zero to avoid double-counting since values ++ are already store in diff_total_*. */ ++ } ++ busy_time = 0; ++ cpu_time = 0; ++ bytes_received = 0; ++ bytes_sent = 0; ++ binlog_bytes_written = 0; ++ updated_row_count = 0; ++ sent_row_count_2 = 0; + } + + +@@ -907,6 +996,33 @@ + } + #endif + ++char *THD::get_client_host_port(THD *client) ++{ ++ Security_context *client_sctx= client->security_ctx; ++ char *client_host= NULL; ++ ++ if (client->peer_port && (client_sctx->host || client_sctx->ip) && ++ security_ctx->host_or_ip[0]) ++ { ++ if ((client_host= this->alloc(LIST_PROCESS_HOST_LEN+1))) ++ my_snprintf((char *) client_host, LIST_PROCESS_HOST_LEN, ++ "%s:%u", client_sctx->host_or_ip, client->peer_port); ++ } ++ else ++ client_host= this->strdup(client_sctx->host_or_ip[0] ? ++ client_sctx->host_or_ip : ++ client_sctx->host ? client_sctx->host : ""); ++ ++ return client_host; ++} ++ ++const char *get_client_host(THD *client) ++{ ++ return client->security_ctx->host_or_ip[0] ? ++ client->security_ctx->host_or_ip : ++ client->security_ctx->host ? 
client->security_ctx->host : ""; ++} ++ + + struct Item_change_record: public ilink + { +@@ -1082,6 +1198,7 @@ + buffer.set(buff, sizeof(buff), &my_charset_bin); + } + thd->sent_row_count++; ++ thd->sent_row_count_2++; + if (!thd->vio_ok()) + DBUG_RETURN(0); + if (!thd->net.report_error) +@@ -1174,6 +1291,7 @@ + select_export::~select_export() + { + thd->sent_row_count=row_count; ++ thd->sent_row_count_2=row_count; + } + + +@@ -2108,6 +2226,7 @@ + if (likely(thd != 0)) + { /* current_thd==0 when close_connection() calls net_send_error() */ + thd->status_var.bytes_sent+= length; ++ thd->bytes_sent+= length; + } + } + +@@ -2115,6 +2234,7 @@ + void thd_increment_bytes_received(ulong length) + { + current_thd->status_var.bytes_received+= length; ++ current_thd->bytes_received+= length; + } + + +diff -r 592f6c3641ba sql/sql_class.h +--- a/sql/sql_class.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_class.h Wed Jul 29 13:34:11 2009 -0700 +@@ -1302,6 +1302,8 @@ + first byte of the packet in do_command() + */ + enum enum_server_command command; ++ // Used to save the command, before it is set to COM_SLEEP. ++ enum enum_server_command old_command; + uint32 server_id; + uint32 file_id; // for LOAD DATA INFILE + /* +@@ -1498,6 +1500,8 @@ + /* variables.transaction_isolation is reset to this after each commit */ + enum_tx_isolation session_tx_isolation; + enum_check_fields count_cuted_fields; ++ ha_rows updated_row_count; ++ ha_rows sent_row_count_2; /* for userstat */ + + DYNAMIC_ARRAY user_var_events; /* For user variables replication */ + MEM_ROOT *user_var_events_alloc; /* Allocate above array elements here */ +@@ -1607,6 +1611,49 @@ + */ + LOG_INFO* current_linfo; + NET* slave_net; // network connection from slave -> m. ++ ++ /* ++ Used to update global user stats. The global user stats are updated ++ occasionally with the 'diff' variables. After the update, the 'diff' ++ variables are reset to 0. ++ */ ++ // Time when the current thread connected to MySQL. 
++ time_t current_connect_time; ++ // Last time when THD stats were updated in global_user_stats. ++ time_t last_global_update_time; ++ // Busy (non-idle) time for just one command. ++ double busy_time; ++ // Busy time not updated in global_user_stats yet. ++ double diff_total_busy_time; ++ // Cpu (non-idle) time for just one thread. ++ double cpu_time; ++ // Cpu time not updated in global_user_stats yet. ++ double diff_total_cpu_time; ++ /* bytes counting */ ++ ulonglong bytes_received; ++ ulonglong diff_total_bytes_received; ++ ulonglong bytes_sent; ++ ulonglong diff_total_bytes_sent; ++ ulonglong binlog_bytes_written; ++ ulonglong diff_total_binlog_bytes_written; ++ ++ // Number of rows not reflected in global_user_stats yet. ++ ha_rows diff_total_sent_rows, diff_total_updated_rows, diff_total_read_rows; ++ // Number of commands not reflected in global_user_stats yet. ++ ulonglong diff_select_commands, diff_update_commands, diff_other_commands; ++ // Number of transactions not reflected in global_user_stats yet. ++ ulonglong diff_commit_trans, diff_rollback_trans; ++ // Number of connection errors not reflected in global_user_stats yet. ++ ulonglong diff_denied_connections, diff_lost_connections; ++ // Number of db access denied, not reflected in global_user_stats yet. ++ ulonglong diff_access_denied_errors; ++ // Number of queries that return 0 rows ++ ulonglong diff_empty_queries; ++ ++ // Per account query delay in miliseconds. When not 0, sleep this number of ++ // milliseconds before every SQL command. ++ ulonglong query_delay_millis; ++ + /* Used by the sys_var class to store temporary values */ + union + { +@@ -1662,6 +1709,11 @@ + alloc_root. + */ + void init_for_queries(); ++ void reset_stats(void); ++ void reset_diff_stats(void); ++ // ran_command is true when this is called immediately after a ++ // command has been run. 
++  void update_stats(bool ran_command);
+   void change_user(void);
+   void cleanup(void);
+   void cleanup_after_query();
+@@ -1891,8 +1943,14 @@
+     if (p_db_length)
+       *p_db_length= db_length;
+     return FALSE;
+   }
+
++  // Returns a string of the form 'IP:port' for the client side of the
++  // connection represented by 'client', as displayed by SHOW PROCESSLIST.
++  // The memory is allocated from the heap of this THD and is not reclaimed
++  // immediately, so use sparingly. May return NULL.
++  char *get_client_host_port(THD *client);
++
+ public:
+   /**
+     Add an internal error handler to the thread execution context.
+@@ -1935,6 +1993,10 @@
+   MEM_ROOT main_mem_root;
+ };
+
++// Returns string as 'IP' for the client-side of the connection represented by
++// 'client'. Does not allocate memory. May return "".
++const char *get_client_host(THD *client);
++
+
+ #define tmp_disable_binlog(A) \
+ {ulonglong tmp_disable_binlog__save_options= (A)->options; \
+diff -r 592f6c3641ba sql/sql_delete.cc
+--- a/sql/sql_delete.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_delete.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -358,6 +358,7 @@
+     send_ok(thd,deleted);
+     DBUG_PRINT("info",("%ld records deleted",(long) deleted));
+   }
++  thd->updated_row_count += deleted;
+   DBUG_RETURN(error >= 0 || thd->net.report_error);
+ }
+
+@@ -887,6 +888,7 @@
+     thd->row_count_func= deleted;
+     ::send_ok(thd, deleted);
+   }
++  thd->updated_row_count += deleted;
+   return 0;
+ }
+
+diff -r 592f6c3641ba sql/sql_insert.cc
+--- a/sql/sql_insert.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_insert.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -990,6 +990,7 @@
+     thd->row_count_func= info.copied + info.deleted + updated;
+     ::send_ok(thd, (ulong) thd->row_count_func, id, buff);
+   }
++  thd->updated_row_count += thd->row_count_func;
+   thd->abort_on_warning= 0;
+   DBUG_RETURN(FALSE);
+
+@@ -3094,6 +3095,7 @@
+     autoinc_value_of_first_inserted_row : thd->insert_id_used ?
+ thd->last_insert_id : 0; + ::send_ok(thd, (ulong) thd->row_count_func, id, buff); ++ thd->updated_row_count += thd->row_count_func; + DBUG_RETURN(0); + } + +diff -r 592f6c3641ba sql/sql_lex.h +--- a/sql/sql_lex.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_lex.h Wed Jul 29 13:34:11 2009 -0700 +@@ -101,6 +101,9 @@ + When a command is added here, be sure it's also added in mysqld.cc + in "struct show_var_st status_vars[]= {" ... + */ ++ // TODO(mcallaghan): update status_vars in mysqld to export these ++ SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS, ++ SQLCOM_SHOW_CLIENT_STATS, + /* This should be the last !!! */ + SQLCOM_END + }; +diff -r 592f6c3641ba sql/sql_parse.cc +--- a/sql/sql_parse.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_parse.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -78,6 +78,12 @@ + const char *table_name); + static bool check_show_create_table_access(THD *thd, TABLE_LIST *table); + ++// Increments connection count for user. ++static int increment_connection_count(THD* thd, bool use_lock); ++ ++// Uses the THD to update the global stats by user name and client IP ++void update_global_user_stats(THD* thd, bool create_user, time_t now); ++ + const char *any_db="*any*"; // Special symbol for check_access + + const char *command_name[]={ +@@ -146,6 +152,17 @@ + static bool do_command(THD *thd); + #endif // EMBEDDED_LIBRARY + ++HASH global_user_stats; ++HASH global_client_stats; ++// Protects global_user_stats and global_client_stats ++extern pthread_mutex_t LOCK_global_user_client_stats; ++ ++HASH global_table_stats; ++extern pthread_mutex_t LOCK_global_table_stats; ++ ++HASH global_index_stats; ++extern pthread_mutex_t LOCK_global_index_stats; ++ + #ifdef __WIN__ + extern void win_install_sigabrt_handler(void); + #endif +@@ -504,6 +521,7 @@ + mysql_log.write(thd,COM_CONNECT,"%s",ER(ER_NOT_SUPPORTED_AUTH_MODE)); + DBUG_RETURN(-1); + } ++ thd->diff_access_denied_errors++; + net_printf_error(thd, ER_ACCESS_DENIED_ERROR, + 
thd->main_security_ctx.user,
+                      thd->main_security_ctx.host_or_ip,
+@@ -536,12 +554,195 @@
+ void init_max_user_conn(void)
+ {
+ #ifndef NO_EMBEDDED_ACCESS_CHECKS
+-  (void) hash_init(&hash_user_connections,system_charset_info,max_connections,
+-                   0,0,
+-                   (hash_get_key) get_key_conn, (hash_free_key) free_user,
+-                   0);
+-#endif
+-}
++  if (hash_init(&hash_user_connections,system_charset_info,max_connections,
++                0,0,
++                (hash_get_key) get_key_conn, (hash_free_key) free_user,
++                0)) {
++    sql_print_error("Initializing hash_user_connections failed.");
++    exit(1);
++  }
++#endif
++}
++
++byte *get_key_user_stats(USER_STATS *user_stats, uint *length,
++                         my_bool not_used __attribute__((unused)))
++{
++  *length = strlen(user_stats->user);
++  return (byte*)user_stats->user;
++}
++
++void free_user_stats(USER_STATS* user_stats)
++{
++  my_free((char*)user_stats, MYF(0));
++}
++
++// Initializes a USER_STATS entry: stores the 'user' and 'priv_user' names
++// and sets every counter to the value passed in. Names longer than the
++// destination buffers are truncated.
++void init_user_stats(USER_STATS *user_stats,
++                     const char *user,
++                     const char *priv_user,
++                     uint total_connections,
++                     uint concurrent_connections,
++                     time_t connected_time,
++                     double busy_time,
++                     double cpu_time,
++                     ulonglong bytes_received,
++                     ulonglong bytes_sent,
++                     ulonglong binlog_bytes_written,
++                     ha_rows rows_fetched,
++                     ha_rows rows_updated,
++                     ha_rows rows_read,
++                     ulonglong select_commands,
++                     ulonglong update_commands,
++                     ulonglong other_commands,
++                     ulonglong commit_trans,
++                     ulonglong rollback_trans,
++                     ulonglong denied_connections,
++                     ulonglong lost_connections,
++                     ulonglong access_denied_errors,
++                     ulonglong empty_queries)
++{
++  DBUG_ENTER("init_user_stats");
++  DBUG_PRINT("info",
++             ("Add user_stats entry for user %s - priv_user %s",
++              user, priv_user));
++  // strmake() always NUL-terminates, unlike strncpy(); get_key_user_stats()
++  // runs strlen() on user_stats->user, so the terminator is required.
++  strmake(user_stats->user, user, sizeof(user_stats->user) - 1);
++  strmake(user_stats->priv_user, priv_user, sizeof(user_stats->priv_user) - 1);
++
++  user_stats->total_connections = total_connections;
++  user_stats->concurrent_connections = concurrent_connections;
++  user_stats->connected_time = connected_time;
++  user_stats->busy_time = busy_time;
++  user_stats->cpu_time = cpu_time;
++  user_stats->bytes_received = bytes_received;
++  user_stats->bytes_sent = bytes_sent;
++  user_stats->binlog_bytes_written = binlog_bytes_written;
++  user_stats->rows_fetched = rows_fetched;
++  user_stats->rows_updated = rows_updated;
++  user_stats->rows_read = rows_read;
++  user_stats->select_commands = select_commands;
++  user_stats->update_commands = update_commands;
++  user_stats->other_commands = other_commands;
++  user_stats->commit_trans = commit_trans;
++  user_stats->rollback_trans = rollback_trans;
++  user_stats->denied_connections = denied_connections;
++  user_stats->lost_connections = lost_connections;
++  user_stats->access_denied_errors = access_denied_errors;
++  user_stats->empty_queries = empty_queries;
++  DBUG_VOID_RETURN;
++}
++
++void add_user_stats(USER_STATS *user_stats,
++                    uint total_connections,
++                    uint concurrent_connections,
++                    time_t connected_time,
++                    double busy_time,
++                    double cpu_time,
++                    ulonglong bytes_received,
++                    ulonglong bytes_sent,
++                    ulonglong binlog_bytes_written,
++                    ha_rows rows_fetched,
++                    ha_rows rows_updated,
++                    ha_rows rows_read,
++                    ulonglong select_commands,
++                    ulonglong update_commands,
++                    ulonglong other_commands,
++                    ulonglong commit_trans,
++                    ulonglong rollback_trans,
++                    ulonglong denied_connections,
++                    ulonglong lost_connections,
++                    ulonglong access_denied_errors,
++                    ulonglong empty_queries)
++{
++  user_stats->total_connections += total_connections;
++  user_stats->concurrent_connections += concurrent_connections;
++  user_stats->connected_time += connected_time;
++  user_stats->busy_time += busy_time;
++  user_stats->cpu_time += cpu_time;
++  user_stats->bytes_received += bytes_received;
++  user_stats->bytes_sent += bytes_sent;
++  user_stats->binlog_bytes_written += binlog_bytes_written;
++  user_stats->rows_fetched += rows_fetched;
++  user_stats->rows_updated += rows_updated;
++  user_stats->rows_read += rows_read;
++  user_stats->select_commands += select_commands;
++
user_stats->update_commands += update_commands; ++ user_stats->other_commands += other_commands; ++ user_stats->commit_trans += commit_trans; ++ user_stats->rollback_trans += rollback_trans; ++ user_stats->denied_connections += denied_connections; ++ user_stats->lost_connections += lost_connections; ++ user_stats->access_denied_errors += access_denied_errors; ++ user_stats->empty_queries += empty_queries; ++} ++ ++void init_global_user_stats(void) ++{ ++ if (hash_init(&global_user_stats, system_charset_info, max_connections, ++ 0, 0, (hash_get_key)get_key_user_stats, ++ (hash_free_key)free_user_stats, 0)) { ++ sql_print_error("Initializing global_user_stats failed."); ++ exit(1); ++ } ++} ++ ++void init_global_client_stats(void) ++{ ++ if (hash_init(&global_client_stats, system_charset_info, max_connections, ++ 0, 0, (hash_get_key)get_key_user_stats, ++ (hash_free_key)free_user_stats, 0)) { ++ sql_print_error("Initializing global_client_stats failed."); ++ exit(1); ++ } ++} ++ ++extern "C" byte *get_key_table_stats(TABLE_STATS *table_stats, uint *length, ++ my_bool not_used __attribute__((unused))) ++{ ++ *length = strlen(table_stats->table); ++ return (byte*)table_stats->table; ++} ++ ++extern "C" void free_table_stats(TABLE_STATS* table_stats) ++{ ++ my_free((char*)table_stats, MYF(0)); ++} ++ ++void init_global_table_stats(void) ++{ ++ if (hash_init(&global_table_stats, system_charset_info, max_connections, ++ 0, 0, (hash_get_key)get_key_table_stats, ++ (hash_free_key)free_table_stats, 0)) { ++ sql_print_error("Initializing global_table_stats failed."); ++ exit(1); ++ } ++} ++ ++extern "C" byte *get_key_index_stats(INDEX_STATS *index_stats, uint *length, ++ my_bool not_used __attribute__((unused))) ++{ ++ *length = strlen(index_stats->index); ++ return (byte*)index_stats->index; ++} ++ ++extern "C" void free_index_stats(INDEX_STATS* index_stats) ++{ ++ my_free((char*)index_stats, MYF(0)); ++} ++ ++void init_global_index_stats(void) ++{ ++ if 
(hash_init(&global_index_stats, system_charset_info, max_connections, ++ 0, 0, (hash_get_key)get_key_index_stats, ++ (hash_free_key)free_index_stats, 0)) { ++ sql_print_error("Initializing global_index_stats failed."); ++ exit(1); ++ } ++} ++ + + + /* +@@ -599,7 +795,10 @@ + + end: + if (error) ++ { ++ statistic_increment(denied_connections, &LOCK_status); + uc->connections--; // no need for decrease_user_connections() here ++ } + (void) pthread_mutex_unlock(&LOCK_user_conn); + DBUG_RETURN(error); + } +@@ -646,6 +845,25 @@ + #endif /* NO_EMBEDDED_ACCESS_CHECKS */ + } + ++void free_global_user_stats(void) ++{ ++ hash_free(&global_user_stats); ++} ++ ++void free_global_table_stats(void) ++{ ++ hash_free(&global_table_stats); ++} ++ ++void free_global_index_stats(void) ++{ ++ hash_free(&global_index_stats); ++} ++ ++void free_global_client_stats(void) ++{ ++ hash_free(&global_client_stats); ++} + + + /* +@@ -698,6 +916,214 @@ + return uc_update_queries[command] != 0; + } + ++// 'mysql_system_user' is used for when the user is not defined for a THD. ++static char mysql_system_user[] = "#mysql_system#"; ++ ++// Returns 'user' if it's not NULL. Returns 'mysql_system_user' otherwise. ++static char* get_valid_user_string(char* user) { ++ return user ? user : mysql_system_user; ++} ++ ++// Increments the global stats connection count for an entry from ++// global_client_stats or global_user_stats. Returns 0 on success ++// and 1 on error. 
++static int increment_count_by_name(const char *name, const char *role_name, ++ HASH *users_or_clients, THD *thd) ++{ ++ USER_STATS* user_stats; ++ ++ if (!(user_stats = (USER_STATS*)hash_search(users_or_clients, name, ++ strlen(name)))) ++ { ++ // First connection for this user or client ++ if (!(user_stats = ((USER_STATS*) ++ my_malloc(sizeof(USER_STATS), MYF(MY_WME | MY_ZEROFILL))))) ++ { ++ return 1; // Out of memory ++ } ++ ++ init_user_stats(user_stats, name, role_name, ++ 0, 0, // connections ++ 0, 0, 0, // time ++ 0, 0, 0, // bytes sent, received and written ++ 0, 0, 0, // rows fetched, updated and read ++ 0, 0, 0, // select, update and other commands ++ 0, 0, // commit and rollback trans ++ thd->diff_denied_connections, ++ 0, // lost connections ++ 0, // access denied errors ++ 0); // empty queries ++ ++ if (my_hash_insert(users_or_clients, (byte*)user_stats)) ++ { ++ my_free((char*)user_stats, 0); ++ return 1; // Out of memory ++ } ++ } ++ user_stats->total_connections++; ++ return 0; ++} ++ ++// Increments the global user and client stats connection count. If 'use_lock' ++// is true, LOCK_global_user_client_stats will be locked/unlocked. Returns ++// 0 on success, 1 on error. ++static int increment_connection_count(THD* thd, bool use_lock) ++{ ++ char* user_string = get_valid_user_string(thd->main_security_ctx.user); ++ const char* client_string = get_client_host(thd); ++ int return_value = 0; ++ ++ if (!opt_userstat_running) ++ return return_value; ++ ++ if (use_lock) pthread_mutex_lock(&LOCK_global_user_client_stats); ++ ++ if (increment_count_by_name(user_string, user_string, ++ &global_user_stats, thd)) ++ { ++ return_value = 1; ++ goto end; ++ } ++ if (increment_count_by_name(client_string, ++ user_string, ++ &global_client_stats, thd)) ++ { ++ return_value = 1; ++ goto end; ++ } ++ ++end: ++ if (use_lock) pthread_mutex_unlock(&LOCK_global_user_client_stats); ++ return return_value; ++} ++ ++// Used to update the global user and client stats. 
++static void update_global_user_stats_with_user(THD* thd, ++ USER_STATS* user_stats, ++ time_t now) ++{ ++ user_stats->connected_time += now - thd->last_global_update_time; ++ thd->last_global_update_time = now; ++ user_stats->busy_time += thd->diff_total_busy_time; ++ user_stats->cpu_time += thd->diff_total_cpu_time; ++ user_stats->bytes_received += thd->diff_total_bytes_received; ++ user_stats->bytes_sent += thd->diff_total_bytes_sent; ++ user_stats->binlog_bytes_written += thd->diff_total_binlog_bytes_written; ++ user_stats->rows_fetched += thd->diff_total_sent_rows; ++ user_stats->rows_updated += thd->diff_total_updated_rows; ++ user_stats->rows_read += thd->diff_total_read_rows; ++ user_stats->select_commands += thd->diff_select_commands; ++ user_stats->update_commands += thd->diff_update_commands; ++ user_stats->other_commands += thd->diff_other_commands; ++ user_stats->commit_trans += thd->diff_commit_trans; ++ user_stats->rollback_trans += thd->diff_rollback_trans; ++ user_stats->denied_connections += thd->diff_denied_connections; ++ user_stats->lost_connections += thd->diff_lost_connections; ++ user_stats->access_denied_errors += thd->diff_access_denied_errors; ++ user_stats->empty_queries += thd->diff_empty_queries; ++} ++ ++// Updates the global stats of a user or client ++void update_global_user_stats(THD* thd, bool create_user, time_t now) ++{ ++ if (opt_userstat_running) { ++ char* user_string = get_valid_user_string(thd->main_security_ctx.user); ++ const char* client_string = get_client_host(thd); ++ ++ USER_STATS* user_stats; ++ pthread_mutex_lock(&LOCK_global_user_client_stats); ++ ++ // Update by user name ++ if ((user_stats = (USER_STATS*)hash_search(&global_user_stats, ++ (byte*)user_string, ++ strlen(user_string)))) { ++ // Found user. 
++ update_global_user_stats_with_user(thd, user_stats, now); ++ } else { ++ // Create the entry ++ if (create_user) { ++ increment_count_by_name(user_string, user_string, ++ &global_user_stats, thd); ++ } ++ } ++ ++ // Update by client IP ++ if ((user_stats = (USER_STATS*)hash_search(&global_client_stats, ++ (byte*)client_string, ++ strlen(client_string)))) { ++ // Found by client IP ++ update_global_user_stats_with_user(thd, user_stats, now); ++ } else { ++ // Create the entry ++ if (create_user) { ++ increment_count_by_name(client_string, ++ user_string, ++ &global_client_stats, thd); ++ } ++ } ++ thd->reset_diff_stats(); ++ ++ pthread_mutex_unlock(&LOCK_global_user_client_stats); ++ } else { ++ thd->reset_diff_stats(); ++ } ++} ++ ++// Determines the concurrent number of connections of current threads. ++static void set_connections_stats() ++{ ++ USER_STATS* user_stats; ++ ++ pthread_mutex_lock(&LOCK_global_user_client_stats); ++ pthread_mutex_lock(&LOCK_thread_count); ++ ++ // Resets all concurrent connections to 0. ++ for (int i = 0; i < global_user_stats.records; ++i) { ++ user_stats = (USER_STATS*)hash_element(&global_user_stats, i); ++ user_stats->concurrent_connections = 0; ++ } ++ for (int i = 0; i < global_client_stats.records; ++i) { ++ user_stats = (USER_STATS*)hash_element(&global_client_stats, i); ++ user_stats->concurrent_connections = 0; ++ } ++ ++ I_List_iterator<THD> it(threads); ++ THD* thd; ++ time_t now = time(NULL); ++ // Iterates through the current threads. ++ while ((thd = it++)) { ++ char* user_string = get_valid_user_string(thd->main_security_ctx.user); ++ if ((user_stats = (USER_STATS*)hash_search(&global_user_stats, ++ (byte*)user_string, ++ strlen(user_string)))) { ++ // Found user. ++ user_stats->concurrent_connections++; ++ update_global_user_stats_with_user(thd, user_stats, now); ++ } else { ++ // The user name should exist. 
++ if (user_string == mysql_system_user) { ++ // Only create the user if it is the mysql_system_user ++ increment_count_by_name(user_string, user_string, ++ &global_user_stats, thd); ++ } ++ } ++ ++ const char* client_string = get_client_host(thd); ++ if ((user_stats = (USER_STATS*)hash_search(&global_client_stats, ++ (byte*)client_string, ++ strlen(client_string)))) { ++ // Found user. ++ user_stats->concurrent_connections++; ++ update_global_user_stats_with_user(thd, user_stats, now); ++ } else { ++ // Do nothing, unlike what is done for global_user_stats ++ } ++ thd->reset_diff_stats(); ++ } ++ pthread_mutex_unlock(&LOCK_thread_count); ++ pthread_mutex_unlock(&LOCK_global_user_client_stats); ++} ++ + /* + Reset per-hour user resource limits when it has been more than + an hour since they were last checked +@@ -1184,6 +1610,8 @@ + my_net_set_read_timeout(net, connect_timeout); + my_net_set_write_timeout(net, connect_timeout); + ++ bool create_user = true; ++ + if ((error=check_connection(thd))) + { // Wrong permissions + if (error > 0) +@@ -1193,8 +1621,22 @@ + my_sleep(1000); /* must wait after eof() */ + #endif + statistic_increment(aborted_connects,&LOCK_status); ++ thd->diff_denied_connections++; ++ if (error == -2) { ++ // Do not create statistics for a user who does not exist, or failed ++ // to authenticate. ++ create_user = false; ++ } + goto end_thread; + } ++ ++ thd->reset_stats(); ++ // Updates global user connection stats. 
++ if (increment_connection_count(thd, true)) { ++ net_send_error(thd, ER_OUTOFMEMORY); // Out of memory ++ goto end_thread; ++ } ++ + #ifdef __NETWARE__ + netware_reg_user(sctx->ip, sctx->user, "MySQL"); + #endif +@@ -1251,6 +1693,7 @@ + (net->vio && net->error && net->report_error)) + { + statistic_increment(aborted_threads, &LOCK_status); ++ thd->diff_lost_connections++; + } + + if (net->error && net->vio != 0 && net->report_error) +@@ -1270,6 +1713,8 @@ + + end_thread: + close_connection(thd, 0, 1); ++ thd->update_stats(false); ++ update_global_user_stats(thd, create_user, time(NULL)); + end_thread(thd,1); + /* + If end_thread returns, we are either running with --one-thread +@@ -1601,6 +2046,13 @@ + + thd->clear_error(); // Clear error message + ++ thd->updated_row_count=0; ++ thd->busy_time=0; ++ thd->cpu_time=0; ++ thd->bytes_received=0; ++ thd->bytes_sent=0; ++ thd->binlog_bytes_written=0; ++ + net_new_transaction(net); + + packet_length= my_net_read(net); +@@ -1759,6 +2211,9 @@ + } + + thd->command=command; ++ // To increment the corrent command counter for user stats, 'command' must ++ // be saved because it is set to COM_SLEEP at the end of this function. ++ thd->old_command = command; + /* + Commands which always take a long time are logged into + the slow log only if opt_log_slow_admin_statements is set. +@@ -4539,6 +4994,15 @@ + if (check_global_access(thd,RELOAD_ACL)) + goto error; + ++ if(lex->type & REFRESH_SLOW_QUERY_LOG) { ++ /* We are only flushing slow query log */ ++ mysql_slow_log.new_file(1); ++ ++ send_ok(thd); ++ break; ++ } ++ ++ + /* + reload_acl_and_cache() will tell us if we are allowed to write to the + binlog or not. +@@ -4847,6 +5311,7 @@ + { + if (check_global_access(thd, SUPER_ACL)) + { ++ thd->diff_access_denied_errors++; + my_error(ER_SPECIFIC_ACCESS_DENIED_ERROR, MYF(0), "SUPER"); + goto create_sp_error; + } +@@ -5691,6 +6156,7 @@ + if (!no_errors) + { + const char *db_name= db ? 
db : thd->db; ++ thd->diff_access_denied_errors++; + my_error(ER_DBACCESS_DENIED_ERROR, MYF(0), + sctx->priv_user, sctx->priv_host, db_name); + } +@@ -5726,6 +6192,7 @@ + { // We can never grant this + DBUG_PRINT("error",("No possible access")); + if (!no_errors) ++ thd->diff_access_denied_errors++; + my_error(ER_ACCESS_DENIED_ERROR, MYF(0), + sctx->priv_user, + sctx->priv_host, +@@ -5758,11 +6225,15 @@ + + DBUG_PRINT("error",("Access denied")); + if (!no_errors) ++ { ++ // increment needs !no_errors condition, otherwise double counting. ++ thd->diff_access_denied_errors++; + my_error(ER_DBACCESS_DENIED_ERROR, MYF(0), + sctx->priv_user, sctx->priv_host, + (db ? db : (thd->db ? + thd->db : + "unknown"))); /* purecov: tested */ ++ } + DBUG_RETURN(TRUE); /* purecov: tested */ + #endif /* NO_EMBEDDED_ACCESS_CHECKS */ + } +@@ -5796,6 +6267,7 @@ + if ((thd->security_ctx->master_access & want_access)) + return 0; + get_privilege_desc(command, sizeof(command), want_access); ++ thd->diff_access_denied_errors++; + my_error(ER_SPECIFIC_ACCESS_DENIED_ERROR, MYF(0), command); + return 1; + #endif /* NO_EMBEDDED_ACCESS_CHECKS */ +@@ -5828,6 +6300,7 @@ + + if (!thd->col_access && check_grant_db(thd, dst_db_name)) + { ++ thd->diff_access_denied_errors++; + my_error(ER_DBACCESS_DENIED_ERROR, MYF(0), + thd->security_ctx->priv_user, + thd->security_ctx->priv_host, +@@ -5859,6 +6332,12 @@ + check_grant(thd, SELECT_ACL, dst_table, 2, UINT_MAX, FALSE); + } + ++ ++ case SCH_USER_STATS: ++ case SCH_CLIENT_STATS: ++ return check_global_access(thd, SUPER_ACL | PROCESS_ACL); ++ case SCH_TABLE_STATS: ++ case SCH_INDEX_STATS: + case SCH_OPEN_TABLES: + case SCH_VARIABLES: + case SCH_STATUS: +@@ -5912,8 +6391,8 @@ + #ifndef NO_EMBEDDED_ACCESS_CHECKS + TABLE_LIST *org_tables= tables; + #endif ++ Security_context *sctx= thd->security_ctx, *backup_ctx= thd->security_ctx; + TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table(); +- Security_context *sctx= thd->security_ctx, *backup_ctx= 
thd->security_ctx; + /* + The check that first_not_own_table is not reached is for the case when + the given table list refers to the list for prelocking (contains tables +@@ -5930,9 +6409,12 @@ + (want_access & ~(SELECT_ACL | EXTRA_ACL | FILE_ACL))) + { + if (!no_errors) ++ { ++ thd->diff_access_denied_errors++; + my_error(ER_DBACCESS_DENIED_ERROR, MYF(0), + sctx->priv_user, sctx->priv_host, + INFORMATION_SCHEMA_NAME.str); ++ } + return TRUE; + } + /* +@@ -6442,6 +6924,30 @@ + lex_start(thd); + mysql_reset_thd_for_next_command(thd); + ++ int start_time_error = 0; ++ int end_time_error = 0; ++ struct timeval start_time, end_time; ++ double start_usecs = 0; ++ double end_usecs = 0; ++ /* cpu time */ ++ int cputime_error = 0; ++ struct timespec tp; ++ double start_cpu_nsecs = 0; ++ double end_cpu_nsecs = 0; ++ ++ if (opt_userstat_running) { ++#ifdef HAVE_CLOCK_GETTIME ++ /* get start cputime */ ++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ ++ // Gets the start time, in order to measure how long this command takes. ++ if (!(start_time_error = gettimeofday(&start_time, NULL))) { ++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec; ++ } ++ } ++ + if (query_cache_send_result_to_client(thd, (char*) inBuf, length) <= 0) + { + LEX *lex= thd->lex; +@@ -6520,6 +7026,43 @@ + *found_semicolon= NULL; + } + ++ if (opt_userstat_running) { ++ // Gets the end time. ++ if (!(end_time_error = gettimeofday(&end_time, NULL))) { ++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec; ++ } ++ ++ // Calculates the difference between the end and start times. ++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) { ++ thd->busy_time = (end_usecs - start_usecs) / 1000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. 
++ if (thd->busy_time > 2629743) { ++ thd->busy_time = 0; ++ } ++ } else { ++ // end time went back in time, or gettimeofday() failed. ++ thd->busy_time = 0; ++ } ++ ++#ifdef HAVE_CLOCK_GETTIME ++ /* get end cputime */ ++ if (!cputime_error && ++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ if (start_cpu_nsecs && !cputime_error) { ++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->cpu_time > 2629743) { ++ thd->cpu_time = 0; ++ } ++ } else ++ thd->cpu_time = 0; ++ } ++ // Updates THD stats and the global user stats. ++ thd->update_stats(true); ++ update_global_user_stats(thd, true, time(NULL)); ++ + DBUG_VOID_RETURN; + } + +@@ -7531,8 +8074,35 @@ + pthread_mutex_unlock(&LOCK_active_mi); + } + #endif +- if (options & REFRESH_USER_RESOURCES) +- reset_mqh((LEX_USER *) NULL); ++ if (options & REFRESH_TABLE_STATS) ++ { ++ pthread_mutex_lock(&LOCK_global_table_stats); ++ free_global_table_stats(); ++ init_global_table_stats(); ++ pthread_mutex_unlock(&LOCK_global_table_stats); ++ } ++ if (options & REFRESH_INDEX_STATS) ++ { ++ pthread_mutex_lock(&LOCK_global_index_stats); ++ free_global_index_stats(); ++ init_global_index_stats(); ++ pthread_mutex_unlock(&LOCK_global_index_stats); ++ } ++ if (options & (REFRESH_USER_STATS | REFRESH_CLIENT_STATS)) ++ { ++ pthread_mutex_lock(&LOCK_global_user_client_stats); ++ if (options & REFRESH_USER_STATS) ++ { ++ free_global_user_stats(); ++ init_global_user_stats(); ++ } ++ if (options & REFRESH_CLIENT_STATS) ++ { ++ free_global_client_stats(); ++ init_global_client_stats(); ++ } ++ pthread_mutex_unlock(&LOCK_global_user_client_stats); ++ } + *write_to_binlog= tmp_write_to_binlog; + return result; + } +diff -r 592f6c3641ba sql/sql_prepare.cc +--- a/sql/sql_prepare.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_prepare.cc Wed Jul 29 13:34:11 2009 -0700 
+@@ -81,6 +81,9 @@ + #include <mysql_com.h> + #endif + ++// Uses the THD to update the global stats by user name and client IP ++void update_global_user_stats(THD* thd, bool create_user, time_t now); ++ + /* A result class used to send cursor rows using the binary protocol. */ + + class Select_fetch_protocol_prep: public select_send +@@ -1910,8 +1913,32 @@ + /* First of all clear possible warnings from the previous command */ + mysql_reset_thd_for_next_command(thd); + ++ int start_time_error = 0; ++ int end_time_error = 0; ++ struct timeval start_time, end_time; ++ double start_usecs = 0; ++ double end_usecs = 0; ++ /* cpu time */ ++ int cputime_error = 0; ++ struct timespec tp; ++ double start_cpu_nsecs = 0; ++ double end_cpu_nsecs = 0; ++ ++ if (opt_userstat_running) { ++#ifdef HAVE_CLOCK_GETTIME ++ /* get start cputime */ ++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ ++ // Gets the start time, in order to measure how long this command takes. ++ if (!(start_time_error = gettimeofday(&start_time, NULL))) { ++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec; ++ } ++ } ++ + if (! (stmt= new Prepared_statement(thd, &thd->protocol_prep))) +- DBUG_VOID_RETURN; /* out of memory: error is set in Sql_alloc */ ++ goto end; /* out of memory: error is set in Sql_alloc */ + + if (thd->stmt_map.insert(thd, stmt)) + { +@@ -1919,7 +1946,7 @@ + The error is set in the insert. The statement itself + will be also deleted there (this is how the hash works). + */ +- DBUG_VOID_RETURN; ++ goto end; + } + + /* Reset warnings from previous command */ +@@ -1941,6 +1968,44 @@ + thd->stmt_map.erase(stmt); + } + /* check_prepared_statemnt sends the metadata packet in case of success */ ++end: ++ if (opt_userstat_running) { ++ // Gets the end time. 
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) { ++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec; ++ } ++ ++ // Calculates the difference between the end and start times. ++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) { ++ thd->busy_time = (end_usecs - start_usecs) / 1000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->busy_time > 2629743) { ++ thd->busy_time = 0; ++ } ++ } else { ++ // end time went back in time, or gettimeofday() failed. ++ thd->busy_time = 0; ++ } ++ ++#ifdef HAVE_CLOCK_GETTIME ++ /* get end cputime */ ++ if (!cputime_error && ++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ if (start_cpu_nsecs && !cputime_error) { ++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->cpu_time > 2629743) { ++ thd->cpu_time = 0; ++ } ++ } else ++ thd->cpu_time = 0; ++ } ++ // Updates THD stats and the global user stats. ++ thd->update_stats(true); ++ update_global_user_stats(thd, true, time(NULL)); ++ + DBUG_VOID_RETURN; + } + +@@ -2281,8 +2346,32 @@ + /* First of all clear possible warnings from the previous command */ + mysql_reset_thd_for_next_command(thd); + ++ int start_time_error = 0; ++ int end_time_error = 0; ++ struct timeval start_time, end_time; ++ double start_usecs = 0; ++ double end_usecs = 0; ++ /* cpu time */ ++ int cputime_error = 0; ++ struct timespec tp; ++ double start_cpu_nsecs = 0; ++ double end_cpu_nsecs = 0; ++ ++ if (opt_userstat_running) { ++#ifdef HAVE_CLOCK_GETTIME ++ /* get start cputime */ ++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ ++ // Gets the start time, in order to measure how long this command takes. 
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) { ++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec; ++ } ++ } ++ + if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_execute"))) +- DBUG_VOID_RETURN; ++ goto end; + + #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER) + thd->profiling.set_query_source(stmt->query, stmt->query_length); +@@ -2325,11 +2414,50 @@ + test(flags & (ulong) CURSOR_TYPE_READ_ONLY)); + if (!(specialflag & SPECIAL_NO_PRIOR)) + my_pthread_setprio(pthread_self(), WAIT_PRIOR); +- DBUG_VOID_RETURN; ++ goto end; + + set_params_data_err: + my_error(ER_WRONG_ARGUMENTS, MYF(0), "mysql_stmt_execute"); + reset_stmt_params(stmt); ++ ++end: ++ if (opt_userstat_running) { ++ // Gets the end time. ++ if (!(end_time_error = gettimeofday(&end_time, NULL))) { ++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec; ++ } ++ ++ // Calculates the difference between the end and start times. ++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) { ++ thd->busy_time = (end_usecs - start_usecs) / 1000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->busy_time > 2629743) { ++ thd->busy_time = 0; ++ } ++ } else { ++ // end time went back in time, or gettimeofday() failed. ++ thd->busy_time = 0; ++ } ++ ++#ifdef HAVE_CLOCK_GETTIME ++ /* get end cputime */ ++ if (!cputime_error && ++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ if (start_cpu_nsecs && !cputime_error) { ++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->cpu_time > 2629743) { ++ thd->cpu_time = 0; ++ } ++ } else ++ thd->cpu_time = 0; ++ } ++ // Updates THD stats and the global user stats. 
++ thd->update_stats(true); ++ update_global_user_stats(thd, true, time(NULL)); ++ + DBUG_VOID_RETURN; + } + +@@ -2423,6 +2551,31 @@ + + /* First of all clear possible warnings from the previous command */ + mysql_reset_thd_for_next_command(thd); ++ ++ int start_time_error = 0; ++ int end_time_error = 0; ++ struct timeval start_time, end_time; ++ double start_usecs = 0; ++ double end_usecs = 0; ++ /* cpu time */ ++ int cputime_error = 0; ++ struct timespec tp; ++ double start_cpu_nsecs = 0; ++ double end_cpu_nsecs = 0; ++ ++ if (opt_userstat_running) { ++#ifdef HAVE_CLOCK_GETTIME ++ /* get start cputime */ ++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ ++ // Gets the start time, in order to measure how long this command takes. ++ if (!(start_time_error = gettimeofday(&start_time, NULL))) { ++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec; ++ } ++ } ++ + statistic_increment(thd->status_var.com_stmt_fetch, &LOCK_status); + if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_fetch"))) + DBUG_VOID_RETURN; +@@ -2455,6 +2608,43 @@ + thd->restore_backup_statement(stmt, &stmt_backup); + thd->stmt_arena= thd; + ++ if (opt_userstat_running) { ++ // Gets the end time. ++ if (!(end_time_error = gettimeofday(&end_time, NULL))) { ++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec; ++ } ++ ++ // Calculates the difference between the end and start times. ++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) { ++ thd->busy_time = (end_usecs - start_usecs) / 1000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->busy_time > 2629743) { ++ thd->busy_time = 0; ++ } ++ } else { ++ // end time went back in time, or gettimeofday() failed. 
++ thd->busy_time = 0; ++ } ++ ++#ifdef HAVE_CLOCK_GETTIME ++ /* get end cputime */ ++ if (!cputime_error && ++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ if (start_cpu_nsecs && !cputime_error) { ++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->cpu_time > 2629743) { ++ thd->cpu_time = 0; ++ } ++ } else ++ thd->cpu_time = 0; ++ } ++ // Updates THD stats and the global user stats. ++ thd->update_stats(true); ++ update_global_user_stats(thd, true, time(NULL)); ++ + DBUG_VOID_RETURN; + } + +@@ -2487,6 +2677,30 @@ + /* First of all clear possible warnings from the previous command */ + mysql_reset_thd_for_next_command(thd); + ++ int start_time_error = 0; ++ int end_time_error = 0; ++ struct timeval start_time, end_time; ++ double start_usecs = 0; ++ double end_usecs = 0; ++ /* cpu time */ ++ int cputime_error = 0; ++ struct timespec tp; ++ double start_cpu_nsecs = 0; ++ double end_cpu_nsecs = 0; ++ ++ if (opt_userstat_running) { ++#ifdef HAVE_CLOCK_GETTIME ++ /* get start cputime */ ++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ ++ // Gets the start time, in order to measure how long this command takes. ++ if (!(start_time_error = gettimeofday(&start_time, NULL))) { ++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec; ++ } ++ } ++ + statistic_increment(thd->status_var.com_stmt_reset, &LOCK_status); + if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_reset"))) + DBUG_VOID_RETURN; +@@ -2503,6 +2717,43 @@ + + send_ok(thd); + ++ if (opt_userstat_running) { ++ // Gets the end time. 
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) { ++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec; ++ } ++ ++ // Calculates the difference between the end and start times. ++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) { ++ thd->busy_time = (end_usecs - start_usecs) / 1000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->busy_time > 2629743) { ++ thd->busy_time = 0; ++ } ++ } else { ++ // end time went back in time, or gettimeofday() failed. ++ thd->busy_time = 0; ++ } ++ ++#ifdef HAVE_CLOCK_GETTIME ++ /* get end cputime */ ++ if (!cputime_error && ++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp))) ++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec; ++#endif ++ if (start_cpu_nsecs && !cputime_error) { ++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000; ++ // In case there are bad values, 2629743 is the #seconds in a month. ++ if (thd->cpu_time > 2629743) { ++ thd->cpu_time = 0; ++ } ++ } else ++ thd->cpu_time = 0; ++ } ++ // Updates THD stats and the global user stats. ++ thd->update_stats(true); ++ update_global_user_stats(thd, true, time(NULL)); ++ + DBUG_VOID_RETURN; + } + +diff -r 592f6c3641ba sql/sql_show.cc +--- a/sql/sql_show.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_show.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -540,6 +540,7 @@ + sctx->master_access); + if (!(db_access & DB_ACLS) && (!grant_option || check_grant_db(thd,dbname))) + { ++ thd->diff_access_denied_errors++; + my_error(ER_DBACCESS_DENIED_ERROR, MYF(0), + sctx->priv_user, sctx->host_or_ip, dbname); + mysql_log.write(thd,COM_INIT_DB,ER(ER_DBACCESS_DENIED_ERROR), +@@ -1890,6 +1891,300 @@ + DBUG_RETURN(FALSE); + } + ++/* ++ Aggregate values for mapped_user entries by their role. 
++ ++ SYNOPSIS ++ aggregate_user_stats ++ all_user_stats - input to aggregate ++ agg_user_stats - returns aggregated values ++ ++ RETURN ++ 0 - OK ++ 1 - error ++ */ ++static int ++aggregate_user_stats(HASH *all_user_stats, HASH *agg_user_stats) ++{ ++ DBUG_ENTER("aggregate_user_stats"); ++ if (hash_init(agg_user_stats, system_charset_info, ++ max(all_user_stats->records, 1), ++ 0, 0, (hash_get_key)get_key_user_stats, ++ (hash_free_key)free_user_stats, 0)) ++ { ++ sql_print_error("Malloc in aggregate_user_stats failed"); ++ DBUG_RETURN(1); ++ } ++ ++ for (int i = 0; i < all_user_stats->records; ++i) { ++ USER_STATS *user = (USER_STATS*)hash_element(all_user_stats, i); ++ USER_STATS *agg_user; ++ if (!(agg_user = (USER_STATS*)hash_search(agg_user_stats, ++ (byte*)user->priv_user, ++ strlen(user->priv_user)))) ++ { ++ // First entry for this role. ++ if (!(agg_user = ++ (USER_STATS*) my_malloc(sizeof(USER_STATS), MYF(MY_WME | MY_ZEROFILL)))) ++ { ++ sql_print_error("Malloc in aggregate_user_stats failed"); ++ DBUG_RETURN(1); ++ } ++ ++ init_user_stats(agg_user, user->priv_user, user->priv_user, ++ user->total_connections, user->concurrent_connections, ++ user->connected_time, user->busy_time, user->cpu_time, ++ user->bytes_received, user->bytes_sent, ++ user->binlog_bytes_written, ++ user->rows_fetched, user->rows_updated, user->rows_read, ++ user->select_commands, user->update_commands, ++ user->other_commands, ++ user->commit_trans, user->rollback_trans, ++ user->denied_connections, user->lost_connections, ++ user->access_denied_errors, user->empty_queries); ++ ++ if (my_hash_insert(agg_user_stats, (byte*)agg_user)) ++ { ++ // Out of memory. ++ my_free((char*)agg_user, 0); ++ sql_print_error("Malloc in aggregate_user_stats failed"); ++ DBUG_RETURN(1); ++ } ++ } ++ else ++ { ++ // Aggregate with existing values for this role. 
++ add_user_stats(agg_user, ++ user->total_connections, user->concurrent_connections, ++ user->connected_time, user->busy_time, user->cpu_time, ++ user->bytes_received, user->bytes_sent, ++ user->binlog_bytes_written, ++ user->rows_fetched, user->rows_updated, user->rows_read, ++ user->select_commands, user->update_commands, ++ user->other_commands, ++ user->commit_trans, user->rollback_trans, ++ user->denied_connections, user->lost_connections, ++ user->access_denied_errors, user->empty_queries); ++ } ++ } ++ DBUG_PRINT("exit", ("aggregated %d input into %d output entries", ++ all_user_stats->records, agg_user_stats->records)); ++ DBUG_RETURN(0); ++} ++ ++/* ++ Write result to network for SHOW USER_STATISTICS ++ ++ SYNOPSIS ++ send_user_stats ++ all_user_stats - values to return ++ table - I_S table ++ ++ RETURN ++ 0 - OK ++ 1 - error ++ */ ++int send_user_stats(THD* thd, HASH *all_user_stats, TABLE *table) ++{ ++ DBUG_ENTER("send_user_stats"); ++ for (int i = 0; i < all_user_stats->records; ++i) { ++ restore_record(table, s->default_values); ++ USER_STATS *user_stats = (USER_STATS*)hash_element(all_user_stats, i); ++ table->field[0]->store(user_stats->user, strlen(user_stats->user), system_charset_info); ++ table->field[1]->store((longlong)user_stats->total_connections); ++ table->field[2]->store((longlong)user_stats->concurrent_connections); ++ table->field[3]->store((longlong)user_stats->connected_time); ++ table->field[4]->store((longlong)user_stats->busy_time); ++ table->field[5]->store((longlong)user_stats->cpu_time); ++ table->field[6]->store((longlong)user_stats->bytes_received); ++ table->field[7]->store((longlong)user_stats->bytes_sent); ++ table->field[8]->store((longlong)user_stats->binlog_bytes_written); ++ table->field[9]->store((longlong)user_stats->rows_fetched); ++ table->field[10]->store((longlong)user_stats->rows_updated); ++ table->field[11]->store((longlong)user_stats->rows_read); ++ 
table->field[12]->store((longlong)user_stats->select_commands); ++ table->field[13]->store((longlong)user_stats->update_commands); ++ table->field[14]->store((longlong)user_stats->other_commands); ++ table->field[15]->store((longlong)user_stats->commit_trans); ++ table->field[16]->store((longlong)user_stats->rollback_trans); ++ table->field[17]->store((longlong)user_stats->denied_connections); ++ table->field[18]->store((longlong)user_stats->lost_connections); ++ table->field[19]->store((longlong)user_stats->access_denied_errors); ++ table->field[20]->store((longlong)user_stats->empty_queries); ++ if (schema_table_store_record(thd, table)) ++ { ++ DBUG_PRINT("error", ("store record error")); ++ DBUG_RETURN(1); ++ } ++ } ++ DBUG_RETURN(0); ++} ++ ++/* ++ Process SHOW USER_STATISTICS ++ ++ SYNOPSIS ++ mysqld_show_user_stats ++ thd - current thread ++ wild - limit results to the entry for this user ++ with_roles - when true, display role for mapped users ++ ++ RETURN ++ 0 - OK ++ 1 - error ++ */ ++ ++ ++int fill_schema_user_stats(THD* thd, TABLE_LIST* tables, COND* cond) ++{ ++ TABLE *table= tables->table; ++ DBUG_ENTER("fill_schema_user_stats"); ++ ++ if (check_global_access(thd, SUPER_ACL | PROCESS_ACL)) ++ DBUG_RETURN(1); ++ ++ // Iterates through all the global stats and sends them to the client. ++ // Pattern matching on the client IP is supported. 
++ ++ pthread_mutex_lock(&LOCK_global_user_client_stats); ++ int result= send_user_stats(thd, &global_user_stats, table); ++ pthread_mutex_unlock(&LOCK_global_user_client_stats); ++ if (result) ++ goto err; ++ ++ DBUG_PRINT("exit", ("fill_schema_user_stats result is 0")); ++ DBUG_RETURN(0); ++ ++ err: ++ DBUG_PRINT("exit", ("fill_schema_user_stats result is 1")); ++ DBUG_RETURN(1); ++} ++ ++/* ++ Process SHOW CLIENT_STATISTICS ++ ++ SYNOPSIS ++ mysqld_show_client_stats ++ thd - current thread ++ wild - limit results to the entry for this client ++ ++ RETURN ++ 0 - OK ++ 1 - error ++ */ ++ ++ ++int fill_schema_client_stats(THD* thd, TABLE_LIST* tables, COND* cond) ++{ ++ TABLE *table= tables->table; ++ DBUG_ENTER("fill_schema_client_stats"); ++ ++ if (check_global_access(thd, SUPER_ACL | PROCESS_ACL)) ++ DBUG_RETURN(1); ++ ++ // Iterates through all the global stats and sends them to the client. ++ // Pattern matching on the client IP is supported. ++ ++ pthread_mutex_lock(&LOCK_global_user_client_stats); ++ int result= send_user_stats(thd, &global_client_stats, table); ++ pthread_mutex_unlock(&LOCK_global_user_client_stats); ++ if (result) ++ goto err; ++ ++ DBUG_PRINT("exit", ("mysqld_show_client_stats result is 0")); ++ DBUG_RETURN(0); ++ ++ err: ++ DBUG_PRINT("exit", ("mysqld_show_client_stats result is 1")); ++ DBUG_RETURN(1); ++} ++ ++ ++// Sends the global table stats back to the client. 
++int fill_schema_table_stats(THD* thd, TABLE_LIST* tables, COND* cond) ++{ ++ TABLE *table= tables->table; ++ DBUG_ENTER("fill_schema_table_stats"); ++ char *table_full_name, *table_schema; ++ ++ pthread_mutex_lock(&LOCK_global_table_stats); ++ for (int i = 0; i < global_table_stats.records; ++i) { ++ restore_record(table, s->default_values); ++ TABLE_STATS *table_stats = ++ (TABLE_STATS*)hash_element(&global_table_stats, i); ++ ++ table_full_name= thd->strdup(table_stats->table); ++ table_schema= strsep(&table_full_name, "."); ++ ++ TABLE_LIST tmp_table; ++ bzero((char*) &tmp_table,sizeof(tmp_table)); ++ tmp_table.table_name= table_full_name; ++ tmp_table.db= table_schema; ++ tmp_table.grant.privilege= 0; ++ if (check_access(thd, SELECT_ACL | EXTRA_ACL, tmp_table.db, ++ &tmp_table.grant.privilege, 0, 0, ++ is_schema_db(table_schema)) || ++ grant_option && check_grant(thd, SELECT_ACL, &tmp_table, 1, UINT_MAX, 1)) ++ continue; ++ ++ table->field[0]->store(table_schema, strlen(table_schema), system_charset_info); ++ table->field[1]->store(table_full_name, strlen(table_full_name), system_charset_info); ++ table->field[2]->store((longlong)table_stats->rows_read, TRUE); ++ table->field[3]->store((longlong)table_stats->rows_changed, TRUE); ++ table->field[4]->store((longlong)table_stats->rows_changed_x_indexes, TRUE); ++ ++ if (schema_table_store_record(thd, table)) ++ { ++ VOID(pthread_mutex_unlock(&LOCK_global_table_stats)); ++ DBUG_RETURN(1); ++ } ++ } ++ pthread_mutex_unlock(&LOCK_global_table_stats); ++ DBUG_RETURN(0); ++} ++ ++// Sends the global index stats back to the client. 
++int fill_schema_index_stats(THD* thd, TABLE_LIST* tables, COND* cond) ++{ ++ TABLE *table= tables->table; ++ DBUG_ENTER("fill_schema_index_stats"); ++ char *index_full_name, *table_schema, *table_name; ++ ++ pthread_mutex_lock(&LOCK_global_index_stats); ++ for (int i = 0; i < global_index_stats.records; ++i) { ++ restore_record(table, s->default_values); ++ INDEX_STATS *index_stats = ++ (INDEX_STATS*)hash_element(&global_index_stats, i); ++ ++ index_full_name= thd->strdup(index_stats->index); ++ table_schema= strsep(&index_full_name, "."); ++ table_name= strsep(&index_full_name, "."); ++ ++ TABLE_LIST tmp_table; ++ bzero((char*) &tmp_table,sizeof(tmp_table)); ++ tmp_table.table_name= table_name; ++ tmp_table.db= table_schema; ++ tmp_table.grant.privilege= 0; ++ if (check_access(thd, SELECT_ACL | EXTRA_ACL, tmp_table.db, ++ &tmp_table.grant.privilege, 0, 0, ++ is_schema_db(table_schema)) || ++ grant_option && check_grant(thd, SELECT_ACL, &tmp_table, 1, UINT_MAX, 1)) ++ continue; ++ ++ table->field[0]->store(table_schema, strlen(table_schema), system_charset_info); ++ table->field[1]->store(table_name, strlen(table_name), system_charset_info); ++ table->field[2]->store(index_full_name, strlen(index_full_name), system_charset_info); ++ table->field[3]->store((longlong)index_stats->rows_read, TRUE); ++ ++ if (schema_table_store_record(thd, table)) ++ { ++ VOID(pthread_mutex_unlock(&LOCK_global_index_stats)); ++ DBUG_RETURN(1); ++ } ++ } ++ pthread_mutex_unlock(&LOCK_global_index_stats); ++ DBUG_RETURN(0); ++} + + /* collect status for all running threads */ + +@@ -4500,6 +4795,77 @@ + {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} + }; + ++ST_FIELD_INFO user_stats_fields_info[]= ++{ ++ {"USER", USERNAME_LENGTH, MYSQL_TYPE_STRING, 0, 0, "User"}, ++ {"TOTAL_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Total_connections"}, ++ {"CONCURRENT_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Concurrent_connections"}, ++ {"CONNECTED_TIME", 
MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Connected_time"}, ++ {"BUSY_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Busy_time"}, ++ {"CPU_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Cpu_time"}, ++ {"BYTES_RECEIVED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_received"}, ++ {"BYTES_SENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_sent"}, ++ {"BINLOG_BYTES_WRITTEN", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Binlog_bytes_written"}, ++ {"ROWS_FETCHED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_fetched"}, ++ {"ROWS_UPDATED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_updated"}, ++ {"TABLE_ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Table_rows_read"}, ++ {"SELECT_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Select_commands"}, ++ {"UPDATE_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Update_commands"}, ++ {"OTHER_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Other_commands"}, ++ {"COMMIT_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Commit_transactions"}, ++ {"ROLLBACK_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rollback_transactions"}, ++ {"DENIED_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Denied_connections"}, ++ {"LOST_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Lost_connections"}, ++ {"ACCESS_DENIED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Access_denied"}, ++ {"EMPTY_QUERIES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Empty_queries"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ ++ST_FIELD_INFO client_stats_fields_info[]= ++{ ++ {"CLIENT", LIST_PROCESS_HOST_LEN, MYSQL_TYPE_STRING, 0, 0, "Client"}, ++ {"TOTAL_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Total_connections"}, ++ {"CONCURRENT_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, 
MYSQL_TYPE_LONG, 0, 0, "Concurrent_connections"}, ++ {"CONNECTED_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Connected_time"}, ++ {"BUSY_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Busy_time"}, ++ {"CPU_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Cpu_time"}, ++ {"BYTES_RECEIVED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_received"}, ++ {"BYTES_SENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_sent"}, ++ {"BINLOG_BYTES_WRITTEN", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Binlog_bytes_written"}, ++ {"ROWS_FETCHED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_fetched"}, ++ {"ROWS_UPDATED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_updated"}, ++ {"TABLE_ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Table_rows_read"}, ++ {"SELECT_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Select_commands"}, ++ {"UPDATE_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Update_commands"}, ++ {"OTHER_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Other_commands"}, ++ {"COMMIT_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Commit_transactions"}, ++ {"ROLLBACK_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rollback_transactions"}, ++ {"DENIED_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Denied_connections"}, ++ {"LOST_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Lost_connections"}, ++ {"ACCESS_DENIED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Access_denied"}, ++ {"EMPTY_QUERIES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Empty_queries"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ ++ ++ST_FIELD_INFO table_stats_fields_info[]= ++{ ++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schema"}, ++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"}, ++ {"ROWS_READ", 
MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_read"}, ++ {"ROWS_CHANGED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_changed"}, ++ {"ROWS_CHANGED_X_INDEXES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_changed_x_#indexes"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; ++ ++ST_FIELD_INFO index_stats_fields_info[]= ++{ ++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schema"}, ++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"}, ++ {"INDEX_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Index_name"}, ++ {"ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_read"}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} ++}; + + /* + Description of ST_FIELD_INFO in table.h +@@ -4509,6 +4875,8 @@ + { + {"CHARACTER_SETS", charsets_fields_info, create_schema_table, + fill_schema_charsets, make_character_sets_old_format, 0, -1, -1, 0}, ++ {"CLIENT_STATISTICS", client_stats_fields_info, create_schema_table, ++ fill_schema_client_stats, make_old_format, 0, -1, -1, 0}, + {"COLLATIONS", collation_fields_info, create_schema_table, + fill_schema_collation, make_old_format, 0, -1, -1, 0}, + {"COLLATION_CHARACTER_SET_APPLICABILITY", coll_charset_app_fields_info, +@@ -4517,6 +4885,8 @@ + get_all_tables, make_columns_old_format, get_schema_column_record, 1, 2, 0}, + {"COLUMN_PRIVILEGES", column_privileges_fields_info, create_schema_table, + fill_schema_column_privileges, 0, 0, -1, -1, 0}, ++ {"INDEX_STATISTICS", index_stats_fields_info, create_schema_table, ++ fill_schema_index_stats, make_old_format, 0, -1, -1, 0}, + {"KEY_COLUMN_USAGE", key_column_usage_fields_info, create_schema_table, + get_all_tables, 0, get_schema_key_column_usage_record, 4, 5, 0}, + {"OPEN_TABLES", open_tables_fields_info, create_schema_table, +@@ -4542,10 +4912,14 @@ + get_all_tables, make_table_names_old_format, 0, 1, 2, 1}, + {"TABLE_PRIVILEGES", table_privileges_fields_info, create_schema_table, + fill_schema_table_privileges, 0, 0, -1, -1, 
0}, ++ {"TABLE_STATISTICS", table_stats_fields_info, create_schema_table, ++ fill_schema_table_stats, make_old_format, 0, -1, -1, 0}, + {"TRIGGERS", triggers_fields_info, create_schema_table, + get_all_tables, make_old_format, get_schema_triggers_record, 5, 6, 0}, + {"USER_PRIVILEGES", user_privileges_fields_info, create_schema_table, + fill_schema_user_privileges, 0, 0, -1, -1, 0}, ++ {"USER_STATISTICS", user_stats_fields_info, create_schema_table, ++ fill_schema_user_stats, make_old_format, 0, -1, -1, 0}, + {"VARIABLES", variables_fields_info, create_schema_table, fill_variables, + make_old_format, 0, -1, -1, 1}, + {"VIEWS", view_fields_info, create_schema_table, +diff -r 592f6c3641ba sql/sql_update.cc +--- a/sql/sql_update.cc Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_update.cc Wed Jul 29 13:34:11 2009 -0700 +@@ -601,7 +601,8 @@ + (thd->client_capabilities & CLIENT_FOUND_ROWS) ? found : updated; + send_ok(thd, (ulong) thd->row_count_func, + thd->insert_id_used ? thd->last_insert_id : 0L,buff); +- DBUG_PRINT("info",("%ld records updated", (long) updated)); ++ thd->updated_row_count += thd->row_count_func; ++ DBUG_PRINT("info",("%ld records updated", (long) updated)); + } + thd->count_cuted_fields= CHECK_FIELD_IGNORE; /* calc cuted fields */ + thd->abort_on_warning= 0; +@@ -1832,5 +1833,6 @@ + (thd->client_capabilities & CLIENT_FOUND_ROWS) ? found : updated; + ::send_ok(thd, (ulong) thd->row_count_func, + thd->insert_id_used ?
thd->last_insert_id : 0L,buff); ++ thd->updated_row_count += thd->row_count_func; + return FALSE; + } +diff -r 592f6c3641ba sql/sql_yacc.yy +--- a/sql/sql_yacc.yy Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/sql_yacc.yy Wed Jul 29 13:34:11 2009 -0700 +@@ -523,6 +523,7 @@ + %token CHECK_SYM + %token CIPHER_SYM + %token CLIENT_SYM ++%token CLIENT_STATS_SYM + %token CLOSE_SYM + %token COALESCE + %token CODE_SYM +@@ -680,6 +681,7 @@ + %token IMPORT + %token INDEXES + %token INDEX_SYM ++%token INDEX_STATS_SYM + %token INFILE + %token INNER_SYM + %token INNOBASE_SYM +@@ -909,6 +911,7 @@ + %token SIGNED_SYM + %token SIMPLE_SYM + %token SLAVE ++%token SLOW_SYM + %token SMALLINT + %token SNAPSHOT_SYM + %token SOUNDS_SYM +@@ -949,6 +952,7 @@ + %token TABLES + %token TABLESPACE + %token TABLE_SYM ++%token TABLE_STATS_SYM + %token TEMPORARY + %token TEMPTABLE_SYM + %token TERMINATED +@@ -991,6 +995,7 @@ + %token UPGRADE_SYM + %token USAGE + %token USER ++%token USER_STATS_SYM + %token USE_FRM + %token USE_SYM + %token USING +@@ -8255,6 +8260,38 @@ + { + Lex->sql_command = SQLCOM_SHOW_SLAVE_STAT; + } ++ | CLIENT_STATS_SYM wild_and_where ++ { ++ LEX *lex= Lex; ++ Lex->sql_command = SQLCOM_SELECT; ++ lex->orig_sql_command= SQLCOM_SHOW_CLIENT_STATS; ++ if (prepare_schema_table(YYTHD, lex, 0, SCH_CLIENT_STATS)) ++ MYSQL_YYABORT; ++ } ++ | USER_STATS_SYM wild_and_where ++ { ++ LEX *lex= Lex; ++ lex->sql_command = SQLCOM_SELECT; ++ lex->orig_sql_command= SQLCOM_SHOW_USER_STATS; ++ if (prepare_schema_table(YYTHD, lex, 0, SCH_USER_STATS)) ++ MYSQL_YYABORT; ++ } ++ | TABLE_STATS_SYM wild_and_where ++ { ++ LEX *lex= Lex; ++ lex->sql_command= SQLCOM_SELECT; ++ lex->orig_sql_command= SQLCOM_SHOW_TABLE_STATS; ++ if (prepare_schema_table(YYTHD, lex, 0, SCH_TABLE_STATS)) ++ MYSQL_YYABORT; ++ } ++ | INDEX_STATS_SYM wild_and_where ++ { ++ LEX *lex= Lex; ++ lex->sql_command= SQLCOM_SELECT; ++ lex->orig_sql_command= SQLCOM_SHOW_INDEX_STATS; ++ if (prepare_schema_table(YYTHD, lex, 0, 
SCH_INDEX_STATS)) ++ MYSQL_YYABORT; ++ } + | CREATE PROCEDURE sp_name + { + LEX *lex= Lex; +@@ -8459,9 +8496,14 @@ + | LOGS_SYM { Lex->type|= REFRESH_LOG; } + | STATUS_SYM { Lex->type|= REFRESH_STATUS; } + | SLAVE { Lex->type|= REFRESH_SLAVE; } ++ | SLOW_SYM QUERY_SYM LOGS_SYM { Lex->type |= REFRESH_SLOW_QUERY_LOG; } + | MASTER_SYM { Lex->type|= REFRESH_MASTER; } + | DES_KEY_FILE { Lex->type|= REFRESH_DES_KEY_FILE; } +- | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; }; ++ | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; } ++ | CLIENT_STATS_SYM { Lex->type|= REFRESH_CLIENT_STATS; } ++ | USER_STATS_SYM { Lex->type|= REFRESH_USER_STATS; } ++ | TABLE_STATS_SYM { Lex->type|= REFRESH_TABLE_STATS; } ++ | INDEX_STATS_SYM { Lex->type|= REFRESH_INDEX_STATS; }; + + opt_table_list: + /* empty */ {;} +@@ -9450,6 +9492,7 @@ + | CHAIN_SYM {} + | CHANGED {} + | CIPHER_SYM {} ++ | CLIENT_STATS_SYM {} + | CLIENT_SYM {} + | CODE_SYM {} + | COLLATION_SYM {} +@@ -9502,6 +9545,7 @@ + | HOSTS_SYM {} + | HOUR_SYM {} + | IDENTIFIED_SYM {} ++ | INDEX_STATS_SYM {} + | INVOKER_SYM {} + | IMPORT {} + | INDEXES {} +@@ -9611,6 +9655,7 @@ + | SIMPLE_SYM {} + | SHARE_SYM {} + | SHUTDOWN {} ++ | SLOW_SYM {} + | SNAPSHOT_SYM {} + | SOUNDS_SYM {} + | SOURCE_SYM {} +@@ -9627,6 +9672,7 @@ + | SUSPEND_SYM {} + | SWAPS_SYM {} + | SWITCHES_SYM {} ++ | TABLE_STATS_SYM {} + | TABLES {} + | TABLESPACE {} + | TEMPORARY {} +@@ -9647,6 +9693,7 @@ + | UNKNOWN_SYM {} + | UNTIL_SYM {} + | USER {} ++ | USER_STATS_SYM {} + | USE_FRM {} + | VARIABLES {} + | VIEW_SYM {} +diff -r 592f6c3641ba sql/structs.h +--- a/sql/structs.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/structs.h Wed Jul 29 13:34:11 2009 -0700 +@@ -273,6 +273,98 @@ + time_t intime; + } USER_CONN; + ++typedef struct st_user_stats { ++ char user[max(USERNAME_LENGTH, LIST_PROCESS_HOST_LEN) + 1]; ++ // Account name the user is mapped to when this is a user from mapped_user. ++ // Otherwise, the same value as user. 
++ char priv_user[max(USERNAME_LENGTH, LIST_PROCESS_HOST_LEN) + 1]; ++ uint total_connections; ++ uint concurrent_connections; ++ time_t connected_time; // in seconds ++ double busy_time; // in seconds ++ double cpu_time; // in seconds ++ ulonglong bytes_received; ++ ulonglong bytes_sent; ++ ulonglong binlog_bytes_written; ++ ha_rows rows_fetched, rows_updated, rows_read; ++ ulonglong select_commands, update_commands, other_commands; ++ ulonglong commit_trans, rollback_trans; ++ ulonglong denied_connections, lost_connections; ++ ulonglong access_denied_errors; ++ ulonglong empty_queries; ++} USER_STATS; ++ ++/* Lookup function for hash tables with USER_STATS entries */ ++extern byte *get_key_user_stats(USER_STATS *user_stats, uint *length, ++ my_bool not_used __attribute__((unused))); ++ ++/* Free all memory for a hash table with USER_STATS entries */ ++extern void free_user_stats(USER_STATS* user_stats); ++ ++/* Initialize an instance of USER_STATS */ ++extern void ++init_user_stats(USER_STATS *user_stats, ++ const char *user, ++ const char *priv_user, ++ uint total_connections, ++ uint concurrent_connections, ++ time_t connected_time, ++ double busy_time, ++ double cpu_time, ++ ulonglong bytes_received, ++ ulonglong bytes_sent, ++ ulonglong binlog_bytes_written, ++ ha_rows rows_fetched, ++ ha_rows rows_updated, ++ ha_rows rows_read, ++ ulonglong select_commands, ++ ulonglong update_commands, ++ ulonglong other_commands, ++ ulonglong commit_trans, ++ ulonglong rollback_trans, ++ ulonglong denied_connections, ++ ulonglong lost_connections, ++ ulonglong access_denied_errors, ++ ulonglong empty_queries); ++ ++/* Increment values of an instance of USER_STATS */ ++extern void ++add_user_stats(USER_STATS *user_stats, ++ uint total_connections, ++ uint concurrent_connections, ++ time_t connected_time, ++ double busy_time, ++ double cpu_time, ++ ulonglong bytes_received, ++ ulonglong bytes_sent, ++ ulonglong binlog_bytes_written, ++ ha_rows rows_fetched, ++ ha_rows
rows_updated, ++ ha_rows rows_read, ++ ulonglong select_commands, ++ ulonglong update_commands, ++ ulonglong other_commands, ++ ulonglong commit_trans, ++ ulonglong rollback_trans, ++ ulonglong denied_connections, ++ ulonglong lost_connections, ++ ulonglong access_denied_errors, ++ ulonglong empty_queries); ++ ++typedef struct st_table_stats { ++ char table[NAME_LEN * 2 + 2]; // [db] + '.' + [table] + '\0' ++ ulonglong rows_read, rows_changed; ++ ulonglong rows_changed_x_indexes; ++ /* Stores enum db_type, but forward declarations cannot be done */ ++ int engine_type; ++} TABLE_STATS; ++ ++typedef struct st_index_stats { ++ char index[NAME_LEN * 3 + 3]; // [db] + '.' + [table] + '.' + [index] + '\0' ++ ulonglong rows_read; ++} INDEX_STATS; ++ ++ + /* Bits in form->update */ + #define REG_MAKE_DUPP 1 /* Make a copy of record when read */ + #define REG_NEW_RECORD 2 /* Write a new record if not found */ +diff -r 592f6c3641ba sql/table.h +--- a/sql/table.h Wed Jul 29 13:33:34 2009 -0700 ++++ b/sql/table.h Wed Jul 29 13:34:11 2009 -0700 +@@ -371,10 +371,12 @@ + enum enum_schema_tables + { + SCH_CHARSETS= 0, ++ SCH_CLIENT_STATS, + SCH_COLLATIONS, + SCH_COLLATION_CHARACTER_SET_APPLICABILITY, + SCH_COLUMNS, + SCH_COLUMN_PRIVILEGES, ++ SCH_INDEX_STATS, + SCH_KEY_COLUMN_USAGE, + SCH_OPEN_TABLES, + SCH_PROFILES, +@@ -387,8 +389,10 @@ + SCH_TABLE_CONSTRAINTS, + SCH_TABLE_NAMES, + SCH_TABLE_PRIVILEGES, ++ SCH_TABLE_STATS, + SCH_TRIGGERS, + SCH_USER_PRIVILEGES, ++ SCH_USER_STATS, + SCH_VARIABLES, + SCH_VIEWS + }; +diff -r 592f6c3641ba strings/Makefile.in +--- a/strings/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/strings/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -342,6 +342,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba support-files/MacOSX/Makefile.in +--- a/support-files/MacOSX/Makefile.in Wed Jul 29 13:33:34 
2009 -0700 ++++ b/support-files/MacOSX/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -148,6 +148,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba support-files/Makefile.in +--- a/support-files/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/support-files/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -171,6 +171,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba support-files/RHEL4-SElinux/Makefile.in +--- a/support-files/RHEL4-SElinux/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/support-files/RHEL4-SElinux/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -146,6 +146,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba tests/Makefile.in +--- a/tests/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/tests/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -193,6 +193,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @CLIENT_LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba tools/Makefile.in +--- a/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -167,6 +167,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba vio/Makefile.in +--- a/vio/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/vio/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -176,6 +176,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = 
@LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba win/Makefile.in +--- a/win/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/win/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -144,6 +144,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = @LIBS@ + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -r 592f6c3641ba zlib/Makefile.in +--- a/zlib/Makefile.in Wed Jul 29 13:33:34 2009 -0700 ++++ b/zlib/Makefile.in Wed Jul 29 13:34:11 2009 -0700 +@@ -187,6 +187,7 @@ + LIBDL = @LIBDL@ + LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@ + LIBOBJS = @LIBOBJS@ ++LIBRT = @LIBRT@ + LIBS = $(NON_THREADED_LIBS) + LIBTOOL = @LIBTOOL@ + LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@ +diff -Nur a/include/mysql_com.h b/include/mysql_com.h +--- a/include/mysql_com.h 2010-05-22 00:26:45.000000000 -0700 ++++ b/include/mysql_com.h 2010-05-22 00:27:14.000000000 -0700 +@@ -228,7 +228,7 @@ + + my_bool report_error; /* We should report error (we have unreported error) */ + my_bool return_errno; +-#if defined(MYSQL_SERVER) && !defined(EMBEDDED_LIBRARY) ++#if defined(MYSQL_SERVER) + /* + Controls whether a big packet should be skipped. + |