diff options
author | Jorge Manuel B. S. Vicetto (jmbsvicetto) <jmbsvicetto@gentoo.org> | 2009-11-25 04:24:02 -0100 |
---|---|---|
committer | Jorge Manuel B. S. Vicetto (jmbsvicetto) <jmbsvicetto@gentoo.org> | 2009-11-25 04:24:02 -0100 |
commit | 30afbb10fb42cb7d2b861dbc925a033f1e33ab7e (patch) | |
tree | 43fa792ecdbb3e8ba85db51261a59c9985b4017e /percona | |
parent | Added percona patches for 5.0.87 and updated index for 5.0.87 release. (diff) | |
download | mysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.tar.gz mysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.tar.bz2 mysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.zip |
Added missing upstream patches and note about them not being applied by upstream and on Gentoo.
Diffstat (limited to 'percona')
-rw-r--r-- | percona/5.0.87-b20-20091116/README-GENTOO | 8 | ||||
-rw-r--r-- | percona/5.0.87-b20-20091116/innodb_extra_status.patch | 747 | ||||
-rw-r--r-- | percona/5.0.87-b20-20091116/innodb_io_tune.patch | 1823 | ||||
-rw-r--r-- | percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch | 1357 | ||||
-rw-r--r-- | percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch | 264 | ||||
-rw-r--r-- | percona/5.0.87-b20-20091116/mirror_binlog.patch | 2694 |
6 files changed, 6893 insertions, 0 deletions
diff --git a/percona/5.0.87-b20-20091116/README-GENTOO b/percona/5.0.87-b20-20091116/README-GENTOO new file mode 100644 index 0000000..a4e2724 --- /dev/null +++ b/percona/5.0.87-b20-20091116/README-GENTOO @@ -0,0 +1,8 @@ +The following patches, while distributed by Percona, are NOT applied in their +specfile. As such, we do not apply them in Gentoo either: +========= +innodb_extra_status.patch +innodb_io_tune.patch +innodb_rw_lock_old.patch +innodb_show_hashed_memory_standalone.patch +mirror_binlog.patch diff --git a/percona/5.0.87-b20-20091116/innodb_extra_status.patch b/percona/5.0.87-b20-20091116/innodb_extra_status.patch new file mode 100644 index 0000000..adc1642 --- /dev/null +++ b/percona/5.0.87-b20-20091116/innodb_extra_status.patch @@ -0,0 +1,747 @@ +diff -r b059d02ec814 innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/buf/buf0buf.c Mon Nov 03 05:09:34 2008 -0800 +@@ -2353,6 +2353,7 @@ + "AWE: Database pages and free buffers mapped in frames %lu\n", + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } ++ if (file) { + fprintf(file, + "Buffer pool size %lu\n" + "Free buffers %lu\n" +@@ -2371,11 +2372,13 @@ + + buf_pool->init_flush[BUF_FLUSH_LIST], + (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]); + ++ } // if (file) + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, + buf_pool->last_printout_time); + buf_pool->last_printout_time = current_time; + ++ if (file) { + fprintf(file, + "Pages read %lu, created %lu, written %lu\n" + "%.2f reads/s, %.2f creates/s, %.2f writes/s\n", +@@ -2405,6 +2408,7 @@ + } else { + fputs("No buffer pool page gets since the last printout\n", + file); ++ } + } + + buf_pool->n_page_gets_old = buf_pool->n_page_gets; +diff -r b059d02ec814 innobase/ibuf/ibuf0ibuf.c +--- a/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:09:34 2008 -0800 +@@ -3519,9 +3519,15 @@ + + mutex_enter(&ibuf_mutex); + ++ 
inno_ibuf_size = 0; ++ inno_ibuf_inserts = 0; ++ inno_ibuf_merged_recs = 0; ++ inno_ibuf_merges = 0; ++ + data = UT_LIST_GET_FIRST(ibuf->data_list); + + while (data) { ++ if (file) { + fprintf(file, + "Ibuf: size %lu, free list len %lu, seg size %lu,\n" + "%lu inserts, %lu merged recs, %lu merges\n", +@@ -3542,6 +3548,12 @@ + } + } + #endif ++ } // if (file) ++ inno_ibuf_size += (ulong) data->size; ++ inno_ibuf_inserts += (ulong) data->n_inserts; ++ inno_ibuf_merged_recs += (ulong) data->n_merged_recs; ++ inno_ibuf_merges += (ulong) data->n_merges; ++ + data = UT_LIST_GET_NEXT(data_list, data); + } + +diff -r b059d02ec814 innobase/include/lock0lock.h +--- a/innobase/include/lock0lock.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/include/lock0lock.h Mon Nov 03 05:09:34 2008 -0800 +@@ -24,6 +24,10 @@ + #endif /* UNIV_DEBUG */ + /* Buffer for storing information about the most recent deadlock error */ + extern FILE* lock_latest_err_file; ++ ++/* number of deadlocks happened so far */ ++extern ulint innodb_deadlocks; ++ + + /************************************************************************* + Gets the size of a lock struct. 
*/ +diff -r b059d02ec814 innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/include/srv0srv.h Mon Nov 03 05:09:34 2008 -0800 +@@ -261,6 +261,12 @@ + /* variable to count the number of random read-aheads were done */ + extern ulint srv_read_ahead_rnd; + ++/* variable to identify if there is currently a long semaphore wait */ ++extern ibool srv_long_lock_wait; ++ ++/* variable to count the number long semaphore waits noticed */ ++extern ulint srv_long_lock_waits; ++ + /* Number of IO operations read/write done for all threads */ + extern ulint os_aio_read_requests; + extern ulint os_aio_write_requests; +@@ -278,6 +284,26 @@ + extern ulint inno_pending_ibuf_aio_reads; + extern ulint inno_pending_log_ios; + extern ulint inno_pending_sync_ios; ++ ++/* all 24 innodb status variables, exported to status */ ++extern ulint inno_transaction_count; ++extern ulint inno_transaction_purge_count; ++extern ulint inno_transaction_purge_lag; ++extern ulint inno_num_active_transactions; ++extern ulint inno_summed_transaction_age; ++extern ulint inno_longest_transaction_age; ++extern ulint inno_lock_wait_timeouts; ++extern ulint inno_num_lock_waiters; ++extern ulint inno_summed_lock_wait_time; ++extern ulint inno_longest_lock_wait; ++extern ulint inno_os_reads; ++extern ulint inno_os_writes; ++extern ulint inno_os_fsyncs; ++extern ulint inno_ibuf_size; ++extern ulint inno_ibuf_inserts; ++extern ulint inno_ibuf_merged_recs; ++extern ulint inno_ibuf_merges; ++extern ulint inno_log_ios_done; + + /* In this structure we store status variables to be passed to MySQL */ + typedef struct export_var_struct export_struc; +@@ -552,6 +578,7 @@ + ulint innodb_data_writes; + ulint innodb_data_written; + ulint innodb_data_reads; ++ ulint innodb_dict_size; + ulint innodb_buffer_pool_pages_total; + ulint innodb_buffer_pool_pages_data; + ulint innodb_buffer_pool_pages_dirty; +@@ -587,6 +614,43 @@ + ulint innodb_rows_inserted; + ulint 
innodb_rows_updated; + ulint innodb_rows_deleted; ++ ibool innodb_long_lock_wait; ++ ulint innodb_long_lock_waits; ++ ++ ulint innodb_os_aio_read_requests; ++ ulint innodb_os_aio_write_requests; ++ ulint innodb_os_aio_pages_read; ++ ulint innodb_os_aio_pages_written; ++ ib_longlong innodb_os_aio_read_time; ++ ib_longlong innodb_os_aio_write_time; ++ ib_longlong innodb_os_aio_read_time_avg; ++ ib_longlong innodb_os_aio_write_time_avg; ++ ulint innodb_deadlocks; ++ ++ // the following 24 variables are exported to "show status" ++ ulint inno_transaction_count; ++ ulint inno_transaction_purge_count; ++ ulint inno_transaction_purge_lag; ++ ulint inno_num_active_transactions; ++ ulint inno_summed_transaction_age; ++ ulint inno_longest_transaction_age; ++ ulint inno_lock_wait_timeouts; ++ ulint inno_num_lock_waiters; ++ ulint inno_summed_lock_wait_time; ++ ulint inno_longest_lock_wait; ++ ulint inno_pending_normal_aio_reads; ++ ulint inno_pending_normal_aio_writes; ++ ulint inno_pending_ibuf_aio_reads; ++ ulint inno_pending_log_ios; ++ ulint inno_pending_sync_ios; ++ ulint inno_os_reads; ++ ulint inno_os_writes; ++ ulint inno_os_fsyncs; ++ ulint inno_ibuf_size; ++ ulint inno_ibuf_inserts; ++ ulint inno_ibuf_merged_recs; ++ ulint inno_ibuf_merges; ++ ulint inno_log_ios_done; + }; + + /* The server system struct */ +diff -r b059d02ec814 innobase/lock/lock0lock.c +--- a/innobase/lock/lock0lock.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/lock/lock0lock.c Mon Nov 03 05:09:34 2008 -0800 +@@ -360,6 +360,9 @@ + ibool lock_deadlock_found = FALSE; + FILE* lock_latest_err_file; + ++/* number of deadlocks happened so far */ ++ulint innodb_deadlocks = 0; ++ + /* Flags for recursive deadlock search */ + #define LOCK_VICTIM_IS_START 1 + #define LOCK_VICTIM_IS_OTHER 2 +@@ -3304,6 +3307,7 @@ + + FILE* ef = lock_latest_err_file; + ++ innodb_deadlocks++; + rewind(ef); + ut_print_timestamp(ef); + +@@ -4238,6 +4242,7 @@ + innobase_mysql_prepare_print_arbitrary_thd(); + 
lock_mutex_enter_kernel(); + ++ if (file) { + if (lock_deadlock_found) { + fputs( + "------------------------\n" +@@ -4269,6 +4274,12 @@ + fprintf(file, + "Total number of lock structs in row lock hash table %lu\n", + (ulong) lock_get_n_rec_locks()); ++ } // if (file) ++ inno_transaction_purge_count = ++ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no); ++ inno_transaction_count = ++ (ulong) ut_dulint_get_low(trx_sys->max_trx_id); ++ inno_transaction_purge_lag = (ulong) trx_sys->rseg_history_len; + } + + /************************************************************************* +@@ -4289,7 +4300,17 @@ + ulint i; + mtr_t mtr; + trx_t* trx; +- ++ time_t current_time = time(NULL); ++ ++ /* init all counters to be updated */ ++ inno_num_lock_waiters = 0; ++ inno_summed_lock_wait_time = 0; ++ inno_longest_lock_wait = 0; ++ inno_num_active_transactions = 0; ++ inno_summed_transaction_age = 0; ++ inno_longest_transaction_age = 0; ++ ++ if (file) { + fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n"); + + /* First print info on non-active transactions */ +@@ -4304,6 +4325,7 @@ + + trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); + } ++ } // if (file) + + loop: + trx = UT_LIST_GET_FIRST(trx_sys->trx_list); +@@ -4330,6 +4352,7 @@ + } + + if (nth_lock == 0) { ++ if (file) { + fputs("---", file); + trx_print(file, trx, 600); + +@@ -4341,11 +4364,27 @@ + (ulong) ut_dulint_get_high(trx->read_view->up_limit_id), + (ulong) ut_dulint_get_low(trx->read_view->up_limit_id)); + } ++ } // if (file) ++ ++ if (trx->conc_state == TRX_ACTIVE) { ++ ulong trx_age = (ulong)difftime(time(NULL), trx->start_time); ++ inno_num_active_transactions++; ++ inno_summed_transaction_age += trx_age; ++ if (inno_longest_transaction_age < trx_age) ++ inno_longest_transaction_age = trx_age; ++ } + + if (trx->que_state == TRX_QUE_LOCK_WAIT) { ++ ulong wait_time = (ulong)difftime(current_time, ++ trx->wait_started); ++ inno_num_lock_waiters++; ++ inno_summed_lock_wait_time += wait_time; ++ if
(inno_longest_lock_wait < wait_time) ++ inno_longest_lock_wait = wait_time; ++ if (file) { + fprintf(file, + "------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n", +- (ulong)difftime(time(NULL), trx->wait_started)); ++ wait_time); + + if (lock_get_type(trx->wait_lock) == LOCK_REC) { + lock_rec_print(file, trx->wait_lock); +@@ -4354,10 +4393,16 @@ + } + + fputs("------------------\n", file); +- } +- } +- +- if (!srv_print_innodb_lock_monitor) { ++ } // if (file) ++ } ++ } ++ ++ /* don't print locks per transaction if either ++ 1) srv_print_innodb_lock_monitor is NOT set, ++ ie no magic table innodb_lock_monitor is created, or ++ 2) file == NULL, ie, at counter updating stage from "show status" ++ */ ++ if (!srv_print_innodb_lock_monitor || !file) { + nth_trx++; + goto loop; + } +diff -r b059d02ec814 innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800 ++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:09:34 2008 -0800 +@@ -267,6 +267,35 @@ + ulint inno_pending_log_ios = 0; + ulint inno_pending_sync_ios = 0; + ++/* variable to identify if there is currently a long semaphore wait */ ++ibool srv_long_lock_wait = FALSE; ++ ++/* variable to count the number long semaphore waits noticed */ ++ulint srv_long_lock_waits = 0; ++ ++/* time interval in seconds allowed to calling innodb_show_status functions */ ++extern long innobase_min_status_update_time_interval; ++ ++/* all 24 innodb status variables, exported to status */ ++ulint inno_transaction_count = 0; ++ulint inno_transaction_purge_count = 0; ++ulint inno_transaction_purge_lag = 0; ++ulint inno_num_active_transactions = 0; ++ulint inno_summed_transaction_age = 0; ++ulint inno_longest_transaction_age = 0; ++ulint inno_lock_wait_timeouts = 0; /* Counts number of lock wait timeouts. 
*/ ++ulint inno_num_lock_waiters = 0; ++ulint inno_summed_lock_wait_time = 0; ++ulint inno_longest_lock_wait = 0; ++ulint inno_os_reads = 0; ++ulint inno_os_writes = 0; ++ulint inno_os_fsyncs = 0; ++ulint inno_ibuf_size = 0; ++ulint inno_ibuf_inserts = 0; ++ulint inno_ibuf_merged_recs = 0; ++ulint inno_ibuf_merges = 0; ++ulint inno_log_ios_done = 0; ++ + /* structure to pass status variables to MySQL */ + export_struc export_vars; + +@@ -419,6 +448,10 @@ + const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS]; + + time_t srv_last_monitor_time; ++ ++/* last time innodb status were updated thru show status */ ++time_t srv_last_innodb_status_time = 0; ++ + + mutex_t srv_innodb_monitor_mutex; + +@@ -677,6 +710,24 @@ + + ulint srv_n_threads_active[SRV_MASTER + 1]; + ulint srv_n_threads[SRV_MASTER + 1]; ++ ++/************************************************************************* ++Prints counters for work done by srv_master_thread. */ ++ ++static ++void ++srv_print_extra( ++/*===================*/ ++ FILE *file) /* in: output stream */ ++{ ++ fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, " ++ "%lu 10_second, %lu background, %lu flush\n", ++ srv_main_1_second_loops, srv_main_sleeps, ++ srv_main_10_second_loops, srv_main_background_loops, ++ srv_main_flush_loops); ++ fprintf(file, "srv_master_thread log flush: %lu sync, %lu async\n", ++ srv_sync_flush, srv_async_flush); ++} + + /************************************************************************* + Sets the info describing an i/o thread current state. 
*/ +@@ -1685,12 +1736,13 @@ + fputs("----------\n" + "BACKGROUND THREAD\n" + "----------\n", file); ++ srv_print_extra(file); + fil_print(file); +- + + fputs("----------\n" + "SEMAPHORES\n" + "----------\n", file); ++ fprintf(file, "Lock wait timeouts %lu\n", inno_lock_wait_timeouts); + sync_print(file); + + /* Conceptually, srv_innodb_monitor_mutex has a very high latching +@@ -1709,24 +1761,6 @@ + + mutex_exit(&dict_foreign_err_mutex); + +- lock_print_info_summary(file); +- if (trx_start) { +- long t = ftell(file); +- if (t < 0) { +- *trx_start = ULINT_UNDEFINED; +- } else { +- *trx_start = (ulint) t; +- } +- } +- lock_print_info_all_transactions(file); +- if (trx_end) { +- long t = ftell(file); +- if (t < 0) { +- *trx_end = ULINT_UNDEFINED; +- } else { +- *trx_end = (ulint) t; +- } +- } + fputs("--------\n" + "FILE I/O\n" + "--------\n", file); +@@ -1815,6 +1849,27 @@ + (srv_n_rows_read - srv_n_rows_read_old) + / time_elapsed); + ++ /* Print open transaction details */ ++ lock_print_info_summary(file); ++ ++ if (trx_start) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_start = ULINT_UNDEFINED; ++ } else { ++ *trx_start = (ulint) t; ++ } ++ } ++ lock_print_info_all_transactions(file); ++ if (trx_end) { ++ long t = ftell(file); ++ if (t < 0) { ++ *trx_end = ULINT_UNDEFINED; ++ } else { ++ *trx_end = (ulint) t; ++ } ++ } ++ + srv_n_rows_inserted_old = srv_n_rows_inserted; + srv_n_rows_updated_old = srv_n_rows_updated; + srv_n_rows_deleted_old = srv_n_rows_deleted; +@@ -1833,7 +1888,8 @@ + void + srv_export_innodb_status(void) + { +- ++ long time_elapsed; ++ time_t current_time; + mutex_enter(&srv_innodb_monitor_mutex); + export_vars.innodb_data_pending_reads= os_n_pending_reads; + export_vars.innodb_data_pending_writes= os_n_pending_writes; +@@ -1844,6 +1900,7 @@ + export_vars.innodb_data_reads= os_n_file_reads; + export_vars.innodb_data_writes= os_n_file_writes; + export_vars.innodb_data_written= srv_data_written; ++ export_vars.innodb_dict_size= 
dict_sys->size; + export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets; + export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests; + export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free; +@@ -1854,10 +1911,12 @@ + export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU); + export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list); + export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free); +- export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number(); ++ /* This function uses too much CPU for large buffer caches. */ ++ export_vars.innodb_buffer_pool_pages_latched= 1; /* buf_get_latched_pages_number(); */ + export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size; + export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size - + UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free); ++ + export_vars.innodb_page_size= UNIV_PAGE_SIZE; + export_vars.innodb_log_waits= srv_log_waits; + export_vars.innodb_os_log_written= srv_os_log_written; +@@ -1885,6 +1944,103 @@ + export_vars.innodb_rows_inserted= srv_n_rows_inserted; + export_vars.innodb_rows_updated= srv_n_rows_updated; + export_vars.innodb_rows_deleted= srv_n_rows_deleted; ++ export_vars.innodb_long_lock_wait = srv_long_lock_wait; ++ export_vars.innodb_long_lock_waits = srv_long_lock_waits; ++ ++ export_vars.innodb_os_aio_read_requests = os_aio_read_requests; ++ export_vars.innodb_os_aio_write_requests = os_aio_write_requests; ++ ++ export_vars.innodb_os_aio_pages_read = os_aio_pages_read; ++ export_vars.innodb_os_aio_pages_written = os_aio_pages_written; ++ ++ export_vars.innodb_os_aio_read_time = os_aio_read_time; ++ export_vars.innodb_os_aio_write_time = os_aio_write_time; ++ ++ if (os_aio_read_requests > 0 ) { ++ export_vars.innodb_os_aio_read_time_avg ++ = os_aio_read_time / os_aio_read_requests; ++ } else { ++ export_vars.innodb_os_aio_read_time_avg = 0; ++ } ++ if 
(os_aio_write_requests > 0 ) { ++ export_vars.innodb_os_aio_write_time_avg ++ = os_aio_write_time / os_aio_write_requests; ++ } else { ++ export_vars.innodb_os_aio_write_time_avg = 0; ++ } ++ ++ export_vars.innodb_deadlocks = innodb_deadlocks; ++ ++ // simulate srv_printf_innodb_monitor, invoked by innodb_show_status ++ // 0. direct printout inno_lock_wait_timeouts, declared in srv0srv.c ++ // total # of variable(s) updated: 1 ++ export_vars.inno_lock_wait_timeouts = inno_lock_wait_timeouts; ++ ++ // *_print functions are allowed to be called once every ++ // some seconds to prevent too frequent invocation. ++ // the number is innobase_min_status_update_time_interval ++ current_time = time(NULL); ++ time_elapsed = difftime(current_time, srv_last_innodb_status_time); ++ if (time_elapsed >= innobase_min_status_update_time_interval) { ++ os_aio_print(NULL); ++ ibuf_print(NULL); ++ buf_print_io(NULL); ++ lock_print_info_summary(NULL); ++ lock_print_info_all_transactions(NULL); ++ ++ srv_last_innodb_status_time = current_time; ++ } ++ ++ // 1. os_aio_print ++ // the following were filled by calling os_aio_print ++ // total # of variable(s) updated: 8 ++ ++ export_vars.inno_pending_normal_aio_reads = ++ inno_pending_normal_aio_reads; ++ export_vars.inno_pending_normal_aio_writes = ++ inno_pending_normal_aio_writes; ++ export_vars.inno_pending_ibuf_aio_reads = inno_pending_ibuf_aio_reads; ++ export_vars.inno_pending_log_ios = inno_pending_log_ios; ++ export_vars.inno_pending_sync_ios = inno_pending_sync_ios; ++ export_vars.inno_os_reads = os_n_file_reads; ++ export_vars.inno_os_writes = os_n_file_writes; ++ export_vars.inno_os_fsyncs = os_n_fsyncs; ++ ++ // 2. ibuf_print() ++ // total # of variable(s) updated: 4 ++ ++ export_vars.inno_ibuf_size = inno_ibuf_size; ++ export_vars.inno_ibuf_inserts = inno_ibuf_inserts; ++ export_vars.inno_ibuf_merged_recs = inno_ibuf_merged_recs; ++ export_vars.inno_ibuf_merges = inno_ibuf_merges; ++ ++ // 3. 
log_print ++ // total # of variable(s) updated: 1 ++ export_vars.inno_log_ios_done = (ulong) log_sys->n_log_ios; ++ ++ // 5. lock_print_info_summary ++ // it enters the mutexes ++ // 1) innobase_mysql_prepare_print_arbitrary_thd() ++ // 2) lock_mutex_enter_kernel() ++ // total # of variable(s) updated: 3 ++ ++ export_vars.inno_transaction_count = inno_transaction_count; ++ export_vars.inno_transaction_purge_count = ++ inno_transaction_purge_count; ++ export_vars.inno_transaction_purge_lag = inno_transaction_purge_lag; ++ ++ // 6. lock_print_info_all_transactions(NULL) ++ // it exits two mutexes entered from lock_print_info_summary(NULL) ++ // total # of variable(s) updated: 6 ++ ++ export_vars.inno_num_active_transactions = inno_num_active_transactions; ++ export_vars.inno_summed_transaction_age = inno_summed_transaction_age; ++ export_vars.inno_longest_transaction_age = inno_longest_transaction_age; ++ ++ export_vars.inno_num_lock_waiters = inno_num_lock_waiters; ++ export_vars.inno_summed_lock_wait_time = inno_summed_lock_wait_time; ++ export_vars.inno_longest_lock_wait = inno_longest_lock_wait; ++ + mutex_exit(&srv_innodb_monitor_mutex); + + } +@@ -2026,6 +2182,7 @@ + if (thr_get_trx(slot->thr)->wait_lock) { + lock_cancel_waiting_and_release( + thr_get_trx(slot->thr)->wait_lock); ++ ++inno_lock_wait_timeouts; + } + } + } +diff -r b059d02ec814 patch_info/innodb_extra_status.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_extra_status.info Mon Nov 03 05:09:34 2008 -0800 +@@ -0,0 +1,9 @@ ++File=innodb_extra_status.patch ++Name=Adds additional information of InnoDB counters into SHOW STATUS ++Version=1.0 ++Author=Google ++License=GPL ++Comment= ++ChangeLog= ++2008-11-03 ++VT: Initial porting +diff -r b059d02ec814 sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -299,12 +299,36 @@ + (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, + 
{"dblwr_writes", + (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, ++ {"dict_size", ++ (char*) &export_vars.innodb_dict_size, SHOW_LONG}, + {"log_waits", + (char*) &export_vars.innodb_log_waits, SHOW_LONG}, + {"log_write_requests", + (char*) &export_vars.innodb_log_write_requests, SHOW_LONG}, + {"log_writes", + (char*) &export_vars.innodb_log_writes, SHOW_LONG}, ++ {"long_lock_wait", ++ (char*) &export_vars.innodb_long_lock_wait, SHOW_BOOL}, ++ {"long_lock_waits", ++ (char*) &export_vars.innodb_long_lock_waits, SHOW_LONG}, ++ ++ {"os_read_requests", ++ (char*) &export_vars.innodb_os_aio_read_requests, SHOW_LONG}, ++ {"os_write_requests", ++ (char*) &export_vars.innodb_os_aio_write_requests, SHOW_LONG}, ++ {"os_pages_read", ++ (char*) &export_vars.innodb_os_aio_pages_read, SHOW_LONG}, ++ {"os_pages_written", ++ (char*) &export_vars.innodb_os_aio_pages_written, SHOW_LONG}, ++ {"os_read_time", ++ (char*) &export_vars.innodb_os_aio_read_time, SHOW_LONGLONG}, ++ {"os_write_time", ++ (char*) &export_vars.innodb_os_aio_write_time, SHOW_LONGLONG}, ++ {"time_per_read", ++ (char*) &export_vars.innodb_os_aio_read_time_avg, SHOW_LONGLONG}, ++ {"time_per_write", ++ (char*) &export_vars.innodb_os_aio_write_time_avg, SHOW_LONGLONG}, ++ + {"os_log_fsyncs", + (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG}, + {"os_log_pending_fsyncs", +@@ -339,6 +363,56 @@ + (char*) &export_vars.innodb_rows_read, SHOW_LONG}, + {"rows_updated", + (char*) &export_vars.innodb_rows_updated, SHOW_LONG}, ++ {"deadlocks", ++ (char*) &export_vars.innodb_deadlocks, SHOW_LONG}, ++ ++ /* 24 innodb status variables exported to status */ ++ {"transaction_count", ++ (char*) &export_vars.inno_transaction_count, SHOW_LONG}, ++ {"transaction_purge_count", ++ (char*) &export_vars.inno_transaction_purge_count, SHOW_LONG}, ++ {"transaction_purge_lag", ++ (char*) &export_vars.inno_transaction_purge_lag, SHOW_LONG}, ++ {"active_transactions", ++ (char*) &export_vars.inno_num_active_transactions, SHOW_LONG}, 
++ {"summed_transaction_age", ++ (char*) &export_vars.inno_summed_transaction_age, SHOW_LONG}, ++ {"longest_transaction_age", ++ (char*) &export_vars.inno_longest_transaction_age, SHOW_LONG}, ++ {"lock_wait_timeouts", ++ (char*) &export_vars.inno_lock_wait_timeouts, SHOW_LONG}, ++ {"lock_waiters", ++ (char*) &export_vars.inno_num_lock_waiters, SHOW_LONG}, ++ {"summed_lock_wait_time", ++ (char*) &export_vars.inno_summed_lock_wait_time, SHOW_LONG}, ++ {"longest_lock_wait", ++ (char*) &export_vars.inno_longest_lock_wait, SHOW_LONG}, ++ {"pending_normal_aio_reads", ++ (char*) &export_vars.inno_pending_normal_aio_reads, SHOW_LONG}, ++ {"pending_normal_aio_writes", ++ (char*) &export_vars.inno_pending_normal_aio_writes, SHOW_LONG}, ++ {"pending_ibuf_aio_reads", ++ (char*) &export_vars.inno_pending_ibuf_aio_reads, SHOW_LONG}, ++ {"pending_log_ios", ++ (char*) &export_vars.inno_pending_log_ios, SHOW_LONG}, ++ {"pending_sync_ios", ++ (char*) &export_vars.inno_pending_sync_ios, SHOW_LONG}, ++ {"os_reads", ++ (char*) &export_vars.inno_os_reads, SHOW_LONG}, ++ {"os_writes", ++ (char*) &export_vars.inno_os_writes, SHOW_LONG}, ++ {"os_fsyncs", ++ (char*) &export_vars.inno_os_fsyncs, SHOW_LONG}, ++ {"ibuf_inserts", ++ (char*) &export_vars.inno_ibuf_inserts, SHOW_LONG}, ++ {"ibuf_size", ++ (char*) &export_vars.inno_ibuf_size, SHOW_LONG}, ++ {"ibuf_merged_recs", ++ (char*) &export_vars.inno_ibuf_merged_recs, SHOW_LONG}, ++ {"ibuf_merges", ++ (char*) &export_vars.inno_ibuf_merges, SHOW_LONG}, ++ {"log_ios_done", ++ (char*) &export_vars.inno_log_ios_done, SHOW_LONG}, + {NullS, NullS, SHOW_LONG}}; + + /* General functions */ +diff -r b059d02ec814 sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/ha_innodb.h Mon Nov 03 05:09:34 2008 -0800 +@@ -198,6 +198,7 @@ + extern struct show_var_st innodb_status_variables[]; + extern ulong innobase_fast_shutdown; + extern long innobase_max_merged_io; ++extern long innobase_min_status_update_time_interval; + extern
ulong innobase_large_page_size; + extern long innobase_mirrored_log_groups, innobase_log_files_in_group; + extern longlong innobase_buffer_pool_size, innobase_log_file_size; +diff -r b059d02ec814 sql/mysqld.cc +--- a/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/mysqld.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -4950,6 +4950,7 @@ + OPT_INNODB_SYNC_SPIN_LOOPS, + OPT_INNODB_CONCURRENCY_TICKETS, + OPT_INNODB_THREAD_SLEEP_DELAY, ++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL, + OPT_BDB_CACHE_SIZE, + OPT_BDB_LOG_BUFFER_SIZE, + OPT_BDB_MAX_LOCK, +@@ -6031,6 +6032,14 @@ + (gptr*) &srv_thread_sleep_delay, + (gptr*) &srv_thread_sleep_delay, + 0, GET_ULONG, REQUIRED_ARG, 10000L, 0L, ULONG_MAX, 0, 1L, 0}, ++ {"innodb_status_update_interval", ++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL, ++ "Minimum time interval in seconds before InnoDB status counters " ++ "are updated during SHOW STATUS. " ++ "InnoDB counters are always updated during SHOW INNODB STATUS.", ++ (gptr*) &innobase_min_status_update_time_interval, ++ (gptr*) &innobase_min_status_update_time_interval, ++ 0, GET_LONG, REQUIRED_ARG, 30, 0, 3600, 0, 1, 0}, + #endif /* HAVE_INNOBASE_DB */ + {"interactive_timeout", OPT_INTERACTIVE_TIMEOUT, + "The number of seconds the server waits for activity on an interactive connection before closing it.", +diff -r b059d02ec814 sql/set_var.cc +--- a/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800 ++++ b/sql/set_var.cc Mon Nov 03 05:09:34 2008 -0800 +@@ -948,6 +948,8 @@ + {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG }, + {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG }, + {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG}, ++ {"innodb_status_update_interval", ++ (char*) &innobase_min_status_update_time_interval, SHOW_LONG}, + #endif + {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS}, + {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS}, diff --git 
a/percona/5.0.87-b20-20091116/innodb_io_tune.patch b/percona/5.0.87-b20-20091116/innodb_io_tune.patch new file mode 100644 index 0000000..3953e1d --- /dev/null +++ b/percona/5.0.87-b20-20091116/innodb_io_tune.patch @@ -0,0 +1,1823 @@ +diff -r 322370200e6a innobase/include/os0file.h +--- a/innobase/include/os0file.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/include/os0file.h Mon Nov 03 05:08:52 2008 -0800 +@@ -532,21 +532,16 @@ + FALSE otherwise */ + const char* path); /* in: path name */ + /**************************************************************************** +-Initializes the asynchronous io system. Creates separate aio array for +-non-ibuf read and write, a third aio array for the ibuf i/o, with just one +-segment, two aio arrays for log reads and writes with one segment, and a +-synchronous aio array of the specified size. The combined number of segments +-in the three first aio arrays is the parameter n_segments given to the +-function. The caller must create an i/o handler thread for each segment in +-the four first arrays, but not for the sync aio array. */ ++Initializes the asynchronous io system. */ + +-void ++ulint + os_aio_init( + /*========*/ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++ /* out: number of AIO handler threads */ ++ ulint ios_per_array, /* in: maximum number of pending aio operations ++ allowed per IO array */ ++ ulint n_read_threads, /* in: number of read threads */ ++ ulint n_write_threads, /* in: number of write threads */ + ulint n_slots_sync); /* in: number of slots in the sync aio array */ + /*********************************************************************** + Requests an asynchronous i/o operation. 
*/ +diff -r 322370200e6a innobase/include/srv0srv.h +--- a/innobase/include/srv0srv.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800 +@@ -87,6 +87,14 @@ + extern ulint srv_lock_table_size; + + extern ulint srv_n_file_io_threads; ++extern ulint srv_n_read_io_threads; ++extern ulint srv_n_write_io_threads; ++ ++/* Number of IO operations per second the server can do */ ++extern ulint srv_io_capacity; ++ ++/* Flush dirty pages when below max dirty percent */ ++extern ibool srv_extra_dirty_writes; + + #ifdef UNIV_LOG_ARCHIVE + extern ibool srv_log_archive_on; +@@ -252,6 +260,24 @@ + + /* variable to count the number of random read-aheads were done */ + extern ulint srv_read_ahead_rnd; ++ ++/* Number of IO operations read/write done for all threads */ ++extern ulint os_aio_read_requests; ++extern ulint os_aio_write_requests; ++ ++/* Number of pages read/written done for all threads */ ++extern ulint os_aio_pages_read; ++extern ulint os_aio_pages_written; ++ ++/* time usec used to perform read/write for all threads */ ++extern ib_longlong os_aio_read_time; ++extern ib_longlong os_aio_write_time; ++ ++extern ulint inno_pending_normal_aio_reads; ++extern ulint inno_pending_normal_aio_writes; ++extern ulint inno_pending_ibuf_aio_reads; ++extern ulint inno_pending_log_ios; ++extern ulint inno_pending_sync_ios; + + /* In this structure we store status variables to be passed to MySQL */ + typedef struct export_var_struct export_struc; +diff -r 322370200e6a innobase/log/log0log.c +--- a/innobase/log/log0log.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/log/log0log.c Mon Nov 03 05:08:52 2008 -0800 +@@ -1537,6 +1537,30 @@ + + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE, + LOG_WRITE_FROM_BACKGROUND_SYNC); ++} ++ ++/******************************************************************** ++Flush the log buffer. Force it to disk depending on the value of ++innodb_flush_log_at_trx_commit. 
*/ ++ ++void ++log_buffer_flush_maybe_sync(void) ++/*==========================*/ ++{ ++ dulint lsn; ++ ++ mutex_enter(&(log_sys->mutex)); ++ ++ lsn = log_sys->lsn; ++ ++ mutex_exit(&(log_sys->mutex)); ++ ++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */ ++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, ++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE, ++ srv_flush_log_at_trx_commit == 1 ? ++ LOG_WRITE_FROM_BACKGROUND_SYNC : ++ LOG_WRITE_FROM_BACKGROUND_ASYNC); + } + + /******************************************************************** +diff -r 322370200e6a innobase/os/os0file.c +--- a/innobase/os/os0file.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/os/os0file.c Mon Nov 03 05:08:52 2008 -0800 +@@ -22,6 +22,8 @@ + #include <errno.h> + #endif /* UNIV_HOTBACKUP */ + ++extern long innobase_max_merged_io; ++ + #undef HAVE_FDATASYNC + + #ifdef POSIX_ASYNC_IO +@@ -63,6 +65,28 @@ + ibool os_aio_use_native_aio = FALSE; + + ibool os_aio_print_debug = FALSE; ++ ++/* State for the state of an IO request in simulated AIO. ++ Protocol for simulated aio: ++ client requests IO: find slot with reserved = FALSE. Add entry with ++ status = OS_AIO_NOT_ISSUED. ++ IO thread wakes: find adjacent slots with reserved = TRUE and status = ++ OS_AIO_NOT_ISSUED. Change status for slots to ++ OS_AIO_ISSUED. ++ IO operation completes: set status for slots to OS_AIO_DONE. set status ++ for the first slot to OS_AIO_CLAIMED and return ++ result for that slot. ++ When there are multiple read and write threads, they all compete to execute ++ the requests in the array (os_aio_array_t). This avoids the need to load ++ balance requests at the time the request is made at the cost of waking all ++ threads when a request is available. ++*/ ++typedef enum { ++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */ ++ OS_AIO_ISSUED, /* Being processed by an IO thread. */ ++ OS_AIO_DONE, /* Request processed. */ ++ OS_AIO_CLAIMED /* Result being returned to client. 
*/ ++} os_aio_status; + + /* The aio array slot structure */ + typedef struct os_aio_slot_struct os_aio_slot_t; +@@ -72,6 +96,8 @@ + ulint pos; /* index of the slot in the aio + array */ + ibool reserved; /* TRUE if this slot is reserved */ ++ os_aio_status status; /* Status for current request. Valid when reserved ++ is TRUE. Used only in simulated aio. */ + time_t reservation_time;/* time when reserved */ + ulint len; /* length of the block to read or + write */ +@@ -82,11 +108,6 @@ + ulint offset_high; /* 32 high bits of file offset */ + os_file_t file; /* file where to read or write */ + const char* name; /* file name or path */ +- ibool io_already_done;/* used only in simulated aio: +- TRUE if the physical i/o already +- made and only the slot message +- needs to be passed to the caller +- of os_aio_simulated_handle */ + fil_node_t* message1; /* message which is given by the */ + void* message2; /* the requester of an aio operation + and which can be used to identify +@@ -116,9 +137,6 @@ + in this array */ + ulint n_slots; /* Total number of slots in the aio array. + This must be divisible by n_threads. */ +- ulint n_segments;/* Number of segments in the aio array of +- pending aio requests. A thread can wait +- separately for any one of the segments. */ + ulint n_reserved;/* Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /* Pointer to the slots in the array */ +@@ -134,6 +152,17 @@ + + /* Array of events used in simulated aio */ + os_event_t* os_aio_segment_wait_events = NULL; ++ ++/* Number of threads for reading and writing. */ ++ulint os_aio_read_threads = 0; ++ulint os_aio_write_threads = 0; ++ ++/* Number for the first global segment for reading. */ ++const ulint os_aio_first_read_segment = 2; ++ ++/* Number for the first global segment for writing. Set to ++2 + os_aio_read_write_threads. */ ++ulint os_aio_first_write_segment = 0; + + /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. 
These + are NULL when the module has not yet been initialized. */ +@@ -143,11 +172,39 @@ + static os_aio_array_t* os_aio_log_array = NULL; + static os_aio_array_t* os_aio_sync_array = NULL; + ++/* Per thread buffer used for merged IO requests. Used by ++os_aio_simulated_handle so that a buffer doesn't have to be allocated ++for each request. */ ++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS]; ++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS]; ++ ++/* Count pages read and written per thread */ ++static ulint os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS]; ++static ulint os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS]; ++ ++/* Number of IO operations done. One request can be for N pages. */ ++static ulint os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS]; ++ ++/* usecs spent blocked on an IO request */ ++static double os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS]; ++/* max usecs spent blocked on an IO request */ ++static double os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS]; ++ ++/* Number of IO global segments. An IO handler thread is created for each ++global segment, except for the segment associated with os_aio_sync_array. ++Several segments can be associated with os_aio_{read,write}_array. One ++segment is created for each of the other arrays. This is also the number ++of valid entries in srv_io_thread_reads, srv_io_thread_writes, ++srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */ + static ulint os_aio_n_segments = ULINT_UNDEFINED; + +-/* If the following is TRUE, read i/o handler threads try to +-wait until a batch of new read requests have been posted */ +-static ibool os_aio_recommend_sleep_for_read_threads = FALSE; ++/* Set to TRUE to temporarily block reads from being scheduled while a batch ++of read requests is added to allow them to be merged by the IO handler thread ++if they are adjacent. 
Declared volatile because we don't want this to be ++read from a register in a loop when another thread may change the value in ++memory. ++*/ ++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE; + + ulint os_n_file_reads = 0; + ulint os_bytes_read_since_printout = 0; +@@ -166,6 +223,19 @@ + ulint os_file_n_pending_pwrites = 0; + ulint os_n_pending_writes = 0; + ulint os_n_pending_reads = 0; ++ ++/* TODO -- does InnoDB provide a portable method for this? */ ++static double time_usecs() { ++#ifdef __WIN__ ++ return 0.0; ++#else ++ struct timeval tv; ++ if (gettimeofday(&tv, NULL)) ++ return 0; ++ else ++ return tv.tv_sec * 1000000.0 + tv.tv_usec; ++#endif ++} + + /*************************************************************************** + Gets the operating system version. Currently works only on Windows. */ +@@ -1351,6 +1421,8 @@ + /* We disable OS caching (O_DIRECT) only on data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { ++ ++ fprintf(stderr, "Using O_DIRECT for file %s\n", name); + + os_file_set_nocache(file, name, mode_str); + } +@@ -1798,6 +1870,32 @@ + #endif /* __WIN__ */ + } + ++#ifndef __WIN__ ++/*************************************************************************** ++Possibly flushes a given file to disk. */ ++ ++ibool ++os_maybe_fsync( ++/*==========*/ ++ /* out: 0 if success, error code otherwise */ ++ os_file_t file) /* in, own: handle to a file */ ++{ ++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fsync(file); ++} ++ ++/*************************************************************************** ++Possibly flushes a given file to disk. */ ++ ++ibool ++os_maybe_fdatasync( ++/*==========*/ ++ /* out: 0 if success, error code otherwise */ ++ os_file_t file) /* in, own: handle to a file */ ++{ ++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 
0 : fdatasync(file); ++} ++#endif ++ + /*************************************************************************** + Flushes the write buffers of a given file to the disk. */ + +@@ -1855,21 +1953,21 @@ + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + } + } + #elif HAVE_FDATASYNC +- ret = fdatasync(file); ++ ret = os_maybe_fdatasync(file); + #else + /* fprintf(stderr, "Flushing to file %p\n", file); */ +- ret = fsync(file); ++ ret = os_maybe_fsync(file); + #endif + os_n_fsyncs++; + +@@ -2298,6 +2396,9 @@ + + return(TRUE); + } ++ fprintf(stderr, ++"InnoDB: error: os_file_pread wanted %lu and got %lu.\n", ++ (ulint) n, (ulint) ret); + #endif + #ifdef __WIN__ + error_handling: +@@ -2784,9 +2885,8 @@ + os_aio_array_create( + /*================*/ + /* out, own: aio array */ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments) /* in: number of segments in the aio array */ ++ ulint n) /* in: maximum number of pending aio operations ++ allowed */ + { + os_aio_array_t* array; + ulint i; +@@ -2795,7 +2895,6 @@ + OVERLAPPED* over; + #endif + ut_a(n > 0); +- ut_a(n_segments > 0); + + array = ut_malloc(sizeof(os_aio_array_t)); + +@@ -2806,7 +2905,6 @@ + os_event_set(array->is_empty); + + array->n_slots = n; +- array->n_segments = n_segments; + array->n_reserved = 0; + array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); + #ifdef __WIN__ +@@ -2833,70 +2931,75 @@ + + /**************************************************************************** + Initializes the asynchronous io system. Calls also os_io_init_simple. 
+-Creates a separate aio array for +-non-ibuf read and write, a third aio array for the ibuf i/o, with just one +-segment, two aio arrays for log reads and writes with one segment, and a +-synchronous aio array of the specified size. The combined number of segments +-in the three first aio arrays is the parameter n_segments given to the +-function. The caller must create an i/o handler thread for each segment in +-the four first arrays, but not for the sync aio array. */ +- +-void ++Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO, ++log IO, and synchronous IO. The caller must create i/o handler thread for all ++but the synchronous aio array. Multiple threads can access the same array for ++the non-ibuf read (prefetch) and write (flush dirty buffer pages) arrays. ++Return the number of AIO handler threads. */ ++ ++ulint + os_aio_init( + /*========*/ +- ulint n, /* in: maximum number of pending aio operations +- allowed; n must be divisible by n_segments */ +- ulint n_segments, /* in: combined number of segments in the four +- first aio arrays; must be >= 4 */ ++ ulint ios_per_array, /* in: maximum number of pending aio operations ++ allowed per array */ ++ ulint n_read_threads, /* in: number of read threads */ ++ ulint n_write_threads, /* in: number of write threads */ + ulint n_slots_sync) /* in: number of slots in the sync aio array */ + { +- ulint n_read_segs; +- ulint n_write_segs; +- ulint n_per_seg; +- ulint i; ++ ulint i; ++ ulint n_segments = 2 + n_read_threads + n_write_threads; + #ifdef POSIX_ASYNC_IO + sigset_t sigset; + #endif +- ut_ad(n % n_segments == 0); +- ut_ad(n_segments >= 4); ++ ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD); ++ ut_a(n_read_threads >= 1 && n_read_threads <= 64); ++ ut_a(n_write_threads >= 1 && n_write_threads <= 64); ++ ut_a(n_segments < SRV_MAX_N_IO_THREADS); + + os_io_init_simple(); + + for (i = 0; i < n_segments; i++) { + srv_set_io_thread_op_info(i, "not started yet"); +- } +- +- n_per_seg = n 
/ n_segments; +- n_write_segs = (n_segments - 2) / 2; +- n_read_segs = n_segments - 2 - n_write_segs; +- +- /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ +- +- os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); ++ os_aio_thread_io_reads[i] = 0; ++ os_aio_thread_io_writes[i] = 0; ++ os_aio_thread_io_requests[i] = 0; ++ os_aio_thread_buffer[i] = 0; ++ os_aio_thread_buffer_size[i] = 0; ++ os_aio_thread_io_wait[i] = 0; ++ os_aio_thread_max_io_wait[i] = 0; ++ } ++ ++ os_aio_read_threads = n_read_threads; ++ os_aio_write_threads = n_write_threads; ++ os_aio_first_write_segment = os_aio_first_read_segment + os_aio_read_threads; ++ ++ fprintf(stderr, ++ "InnoDB: ios_per_array %lu read threads %lu write threads %lu\n", ++ ios_per_array, os_aio_read_threads, os_aio_write_threads); ++ ++ os_aio_ibuf_array = os_aio_array_create(ios_per_array); + + srv_io_thread_function[0] = "insert buffer thread"; + +- os_aio_log_array = os_aio_array_create(n_per_seg, 1); ++ os_aio_log_array = os_aio_array_create(ios_per_array); + + srv_io_thread_function[1] = "log thread"; + +- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, +- n_read_segs); +- for (i = 2; i < 2 + n_read_segs; i++) { ++ os_aio_read_array = os_aio_array_create(ios_per_array); ++ for (i = os_aio_first_read_segment; i < os_aio_first_write_segment; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); +- srv_io_thread_function[i] = "read thread"; +- } +- +- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, +- n_write_segs); +- for (i = 2 + n_read_segs; i < n_segments; i++) { ++ srv_io_thread_function[i] = "read thread"; ++ } ++ ++ os_aio_write_array = os_aio_array_create(ios_per_array); ++ for (i = os_aio_first_write_segment; i < n_segments; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); +- srv_io_thread_function[i] = "write thread"; +- } +- +- os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); +- +- os_aio_n_segments = n_segments; ++ srv_io_thread_function[i] = "write thread"; ++ } ++ ++ 
os_aio_sync_array = os_aio_array_create(n_slots_sync); ++ ++ os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads; + + os_aio_validate(); + +@@ -2924,6 +3027,7 @@ + + pthread_sigmask(SIG_BLOCK, &sigset, NULL); */ + #endif ++ return os_aio_n_segments; + } + + #ifdef WIN_ASYNC_IO +@@ -2981,77 +3085,32 @@ + os_event_wait(os_aio_write_array->is_empty); + } + +-/************************************************************************** +-Calculates segment number for a slot. */ +-static +-ulint +-os_aio_get_segment_no_from_slot( +-/*============================*/ +- /* out: segment number (which is the number +- used by, for example, i/o-handler threads) */ +- os_aio_array_t* array, /* in: aio wait array */ +- os_aio_slot_t* slot) /* in: slot in this array */ +-{ +- ulint segment; +- ulint seg_len; +- +- if (array == os_aio_ibuf_array) { +- segment = 0; +- +- } else if (array == os_aio_log_array) { +- segment = 1; +- +- } else if (array == os_aio_read_array) { +- seg_len = os_aio_read_array->n_slots / +- os_aio_read_array->n_segments; +- +- segment = 2 + slot->pos / seg_len; +- } else { +- ut_a(array == os_aio_write_array); +- seg_len = os_aio_write_array->n_slots / +- os_aio_write_array->n_segments; +- +- segment = os_aio_read_array->n_segments + 2 +- + slot->pos / seg_len; +- } +- +- return(segment); +-} +- +-/************************************************************************** +-Calculates local segment number and aio array from global segment number. */ +-static +-ulint +-os_aio_get_array_and_local_segment( ++ ++/************************************************************************** ++Calculates aio array from global segment number. 
*/ ++static ++os_aio_array_t* ++os_aio_get_array( + /*===============================*/ +- /* out: local segment number within +- the aio array */ +- os_aio_array_t** array, /* out: aio wait array */ ++ /* out: aio wait array */ + ulint global_segment)/* in: global segment number */ + { +- ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (global_segment == 0) { +- *array = os_aio_ibuf_array; +- segment = 0; ++ return os_aio_ibuf_array; + + } else if (global_segment == 1) { +- *array = os_aio_log_array; +- segment = 0; +- +- } else if (global_segment < os_aio_read_array->n_segments + 2) { +- *array = os_aio_read_array; +- +- segment = global_segment - 2; +- } else { +- *array = os_aio_write_array; +- +- segment = global_segment - (os_aio_read_array->n_segments + 2); +- } +- +- return(segment); ++ return os_aio_log_array; ++ ++ } else if (global_segment < os_aio_first_write_segment) { ++ return os_aio_read_array; ++ ++ } else { ++ return os_aio_write_array; ++ ++ } + } + + /*********************************************************************** +@@ -3160,7 +3219,7 @@ + + os_aio_simulated_wake_handler_threads(); + } +- ++ + os_event_wait(array->not_full); + + goto loop; +@@ -3173,7 +3232,7 @@ + break; + } + } +- ++ ut_a(i < array->n_slots); + array->n_reserved++; + + if (array->n_reserved == 1) { +@@ -3195,7 +3254,7 @@ + slot->buf = buf; + slot->offset = offset; + slot->offset_high = offset_high; +- slot->io_already_done = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + #ifdef WIN_ASYNC_IO + control = &(slot->control); +@@ -3246,8 +3305,9 @@ + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); +- ++ + slot->reserved = FALSE; ++ slot->status = OS_AIO_NOT_ISSUED; + + array->n_reserved--; + +@@ -3266,36 +3326,40 @@ + } + + /************************************************************************** +-Wakes up a simulated aio i/o-handler thread if it has something to do. 
*/ ++Wake up the simulated aio i/o-handler threads for a given array if there ++is work to do. */ + static + void + os_aio_simulated_wake_handler_thread( + /*=================================*/ +- ulint global_segment) /* in: the number of the segment in the aio +- arrays */ +-{ +- os_aio_array_t* array; +- os_aio_slot_t* slot; +- ulint segment; ++ os_aio_array_t* array) /* in: aio array for which wakeup is done */ ++{ ++ os_aio_slot_t* slot; + ulint n; + ulint i; + + ut_ad(!os_aio_use_native_aio); + +- segment = os_aio_get_array_and_local_segment(&array, global_segment); +- +- n = array->n_slots / array->n_segments; +- +- /* Look through n slots after the segment * n'th slot */ +- +- os_mutex_enter(array->mutex); +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved) { +- /* Found an i/o request */ +- ++ n = array->n_slots; ++ ++ /* Look through n slots */ ++ ++ os_mutex_enter(array->mutex); ++ ++ for (i = 0; i < n; i++) { ++ slot = os_aio_array_get_nth_slot(array, i ); ++ ++ if (slot->reserved && ++ (slot->status == OS_AIO_NOT_ISSUED || ++ slot->status == OS_AIO_DONE)) { ++ /* Found an i/o request ++ /* OS_AIO_NOT_ISSUED means the read or write request has ++ * yet to be done. OS_AIO_DONE means the request has been ++ * done but it was part of a set of requests merged into ++ * one read or write call and was not the first block in ++ * the request, so the handling of the IO completion for ++ * that block has not been done. 
*/ ++ + break; + } + } +@@ -3303,7 +3367,25 @@ + os_mutex_exit(array->mutex); + + if (i < n) { +- os_event_set(os_aio_segment_wait_events[global_segment]); ++ if (array == os_aio_ibuf_array) { ++ os_event_set(os_aio_segment_wait_events[0]); ++ ++ } else if (array == os_aio_log_array) { ++ os_event_set(os_aio_segment_wait_events[1]); ++ ++ } else if (array == os_aio_read_array) { ++ ulint x; ++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else if (array == os_aio_write_array) { ++ ulint x; ++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++) ++ os_event_set(os_aio_segment_wait_events[x]); ++ ++ } else { ++ ut_a(0); ++ } + } + } + +@@ -3320,13 +3402,14 @@ + /* We do not use simulated aio: do nothing */ + + return; +- } +- +- os_aio_recommend_sleep_for_read_threads = FALSE; +- +- for (i = 0; i < os_aio_n_segments; i++) { +- os_aio_simulated_wake_handler_thread(i); +- } ++ } ++ ++ os_aio_recommend_sleep_for_read_threads = FALSE; ++ ++ os_aio_simulated_wake_handler_thread(os_aio_ibuf_array); ++ os_aio_simulated_wake_handler_thread(os_aio_log_array); ++ os_aio_simulated_wake_handler_thread(os_aio_read_array); ++ os_aio_simulated_wake_handler_thread(os_aio_write_array); + } + + /************************************************************************** +@@ -3339,18 +3422,13 @@ + os_aio_simulated_put_read_threads_to_sleep(void) + /*============================================*/ + { +- os_aio_array_t* array; + ulint g; + ++ /* TODO(mcallaghan): provide similar function for write? 
*/ + os_aio_recommend_sleep_for_read_threads = TRUE; + +- for (g = 0; g < os_aio_n_segments; g++) { +- os_aio_get_array_and_local_segment(&array, g); +- +- if (array == os_aio_read_array) { +- +- os_event_reset(os_aio_segment_wait_events[g]); +- } ++ for (g = os_aio_first_read_segment; g < os_aio_first_write_segment; g++) { ++ os_event_reset(os_aio_segment_wait_events[g]); + } + } + +@@ -3480,8 +3558,7 @@ + #endif + } else { + if (!wake_later) { +- os_aio_simulated_wake_handler_thread( +- os_aio_get_segment_no_from_slot(array, slot)); ++ os_aio_simulated_wake_handler_thread(array); + } + } + } else if (type == OS_FILE_WRITE) { +@@ -3497,8 +3574,7 @@ + #endif + } else { + if (!wake_later) { +- os_aio_simulated_wake_handler_thread( +- os_aio_get_segment_no_from_slot(array, slot)); ++ os_aio_simulated_wake_handler_thread(array); + } + } + } else { +@@ -3561,7 +3637,7 @@ + os_aio_windows_handle( + /*==================*/ + /* out: TRUE if the aio operation succeeded */ +- ulint segment, /* in: the number of the segment in the aio ++ ulint global_segment, /* in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as +@@ -3579,7 +3655,6 @@ + void** message2, + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ + { +- ulint orig_seg = segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; +@@ -3588,33 +3663,30 @@ + BOOL ret; + DWORD len; + +- if (segment == ULINT_UNDEFINED) { ++ if (global_segment == ULINT_UNDEFINED) { + array = os_aio_sync_array; +- segment = 0; +- } else { +- segment = os_aio_get_array_and_local_segment(&array, segment); ++ } else { ++ array = os_aio_get_array(global_segment); + } + + /* NOTE! We only access constant fields in os_aio_array. 
Therefore + we do not have to acquire the protecting mutex yet */ + + ut_ad(os_aio_validate()); +- ut_ad(segment < array->n_segments); +- +- n = array->n_slots / array->n_segments; ++ ++ n = array->n_slots; + + if (array == os_aio_sync_array) { + os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); + i = pos; + } else { +- srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); +- i = os_event_wait_multiple(n, +- (array->native_events) + segment * n); +- } +- +- os_mutex_enter(array->mutex); +- +- slot = os_aio_array_get_nth_slot(array, i + segment * n); ++ srv_set_io_thread_op_info(global_segment, "wait Windows aio"); ++ i = os_event_wait_multiple(n, (array->native_events)); ++ } ++ ++ os_mutex_enter(array->mutex); ++ ++ slot = os_aio_array_get_nth_slot(array, i); + + ut_a(slot->reserved); + +@@ -3787,14 +3859,16 @@ + ulint* type) /* out: OS_FILE_WRITE or ..._READ */ + { + os_aio_array_t* array; +- ulint segment; + os_aio_slot_t* slot; + os_aio_slot_t* slot2; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; ++ os_aio_slot_t* lowest_request; ++ os_aio_slot_t* oldest_request; + ulint n_consecutive; + ulint total_len; + ulint offs; + ulint lowest_offset; ++ ulint oldest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; +@@ -3802,8 +3876,10 @@ + ibool ret; + ulint n; + ulint i; +- +- segment = os_aio_get_array_and_local_segment(&array, global_segment); ++ ++ double start_usecs, stop_usecs, elapsed_usecs; ++ time_t now; ++ array = os_aio_get_array(global_segment); + + restart: + /* NOTE! We only access constant fields in os_aio_array. 
Therefore +@@ -3812,11 +3888,10 @@ + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate()); +- ut_ad(segment < array->n_segments); +- +- n = array->n_slots / array->n_segments; +- +- /* Look through n slots after the segment * n'th slot */ ++ ++ n = array->n_slots; ++ ++ /* Look through n slots */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { +@@ -3836,9 +3911,9 @@ + done */ + + for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved && slot->io_already_done) { ++ slot = os_aio_array_get_nth_slot(array, i); ++ ++ if (slot->reserved && slot->status == OS_AIO_DONE) { + + if (os_aio_print_debug) { + fprintf(stderr, +@@ -3846,79 +3921,66 @@ + } + + ret = TRUE; +- ++ + goto slot_io_done; + } + } + +- n_consecutive = 0; +- +- /* If there are at least 2 seconds old requests, then pick the oldest +- one to prevent starvation. If several requests have the same age, +- then pick the one at the lowest offset. */ +- + biggest_age = 0; +- lowest_offset = ULINT_MAX; +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, i + segment * n); +- +- if (slot->reserved) { +- age = (ulint)difftime(time(NULL), +- slot->reservation_time); +- ++ now = time(NULL); ++ oldest_request = lowest_request = NULL; ++ oldest_offset = lowest_offset = ULINT_MAX; ++ ++ /* Find the oldest request and the request with the smallest offset */ ++ for (i = 0; i < n; i++) { ++ slot = os_aio_array_get_nth_slot(array, i); ++ ++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) { ++ age = (ulint)difftime(now, slot->reservation_time); ++ ++ /* If there are at least 2 seconds old requests, then pick the oldest ++ one to prevent starvation. If several requests have the same age, ++ then pick the one at the lowest offset. 
*/ + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age +- && slot->offset < lowest_offset)) { ++ && slot->offset < oldest_offset)) { + + /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- + biggest_age = age; ++ oldest_request = slot; ++ oldest_offset = slot->offset; ++ } ++ ++ /* Look for an i/o request at the lowest offset in the array ++ * (we ignore the high 32 bits of the offset) */ ++ if (slot->offset < lowest_offset) { ++ /* Found an i/o request */ ++ lowest_request = slot; + lowest_offset = slot->offset; + } + } + } + +- if (n_consecutive == 0) { +- /* There were no old requests. Look for an i/o request at the +- lowest offset in the array (we ignore the high 32 bits of the +- offset in these heuristics) */ +- +- lowest_offset = ULINT_MAX; +- +- for (i = 0; i < n; i++) { +- slot = os_aio_array_get_nth_slot(array, +- i + segment * n); +- +- if (slot->reserved && slot->offset < lowest_offset) { +- +- /* Found an i/o request */ +- consecutive_ios[0] = slot; +- +- n_consecutive = 1; +- +- lowest_offset = slot->offset; +- } +- } +- } +- +- if (n_consecutive == 0) { ++ if (!lowest_request && !oldest_request) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + +- slot = consecutive_ios[0]; ++ if (oldest_request) { ++ slot = oldest_request; ++ } else { ++ slot = lowest_request; ++ } ++ consecutive_ios[0] = slot; ++ n_consecutive = 1; + + /* Check if there are several consecutive blocks to read or write */ + + consecutive_loop: + for (i = 0; i < n; i++) { +- slot2 = os_aio_array_get_nth_slot(array, i + segment * n); ++ slot2 = os_aio_array_get_nth_slot(array, i); + + if (slot2->reserved && slot2 != slot + && slot2->offset == slot->offset + slot->len +@@ -3926,7 +3988,8 @@ + sum does not wrap over */ + && slot2->offset_high == slot->offset_high + && slot2->type == slot->type +- && slot2->file == slot->file) { ++ && slot2->file == slot->file ++ && slot2->status == OS_AIO_NOT_ISSUED) { + + /* 
Found a consecutive i/o request */ + +@@ -3935,7 +3998,8 @@ + + slot = slot2; + +- if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { ++ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE ++ && n_consecutive < innobase_max_merged_io) { + + goto consecutive_loop; + } else { +@@ -3955,6 +4019,8 @@ + + for (i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; ++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_ISSUED; + } + + if (n_consecutive == 1) { +@@ -3962,7 +4028,16 @@ + combined_buf = slot->buf; + combined_buf2 = NULL; + } else { +- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) { ++ ++ if (os_aio_thread_buffer[global_segment]) ++ ut_free(os_aio_thread_buffer[global_segment]); ++ ++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE); ++ ++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE; ++ } ++ combined_buf2 = os_aio_thread_buffer[global_segment]; + + ut_a(combined_buf2); + +@@ -3973,6 +4048,9 @@ + this assumes that there is just one i/o-handler thread serving + a single segment of slots! 
*/ + ++ ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_ISSUED); ++ + os_mutex_exit(array->mutex); + + if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { +@@ -3998,6 +4076,7 @@ + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (slot->type == OS_FILE_WRITE) { ++ os_aio_thread_io_writes[global_segment] += n_consecutive; + if (array == os_aio_write_array) { + if ((total_len % UNIV_PAGE_SIZE != 0) + || (slot->offset % UNIV_PAGE_SIZE != 0)) { +@@ -4012,16 +4091,34 @@ + os_file_check_page_trailers(combined_buf, total_len); + } + ++ start_usecs = time_usecs(); + ret = os_file_write(slot->name, slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); +- ++ stop_usecs = time_usecs(); ++ elapsed_usecs = stop_usecs - start_usecs; ++ if (elapsed_usecs < 0) elapsed_usecs = 0; + if (array == os_aio_write_array) { + os_file_check_page_trailers(combined_buf, total_len); + } +- } else { ++ os_aio_write_requests++; ++ os_aio_pages_written += n_consecutive; ++ os_aio_write_time += (ib_longlong)elapsed_usecs; ++ } else { ++ start_usecs = time_usecs(); ++ os_aio_thread_io_reads[global_segment] += n_consecutive; + ret = os_file_read(slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); +- } ++ stop_usecs = time_usecs(); ++ elapsed_usecs = stop_usecs - start_usecs; ++ if (elapsed_usecs < 0) elapsed_usecs = 0; ++ os_aio_read_requests++; ++ os_aio_pages_read += n_consecutive; ++ os_aio_read_time += (ib_longlong)elapsed_usecs; ++ } ++ if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment]) ++ os_aio_thread_max_io_wait[global_segment] = elapsed_usecs; ++ os_aio_thread_io_wait[global_segment] += elapsed_usecs; ++ os_aio_thread_io_requests[global_segment]++; + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); +@@ -4042,16 +4139,13 @@ + } + } + +- if (combined_buf2) { +- ut_free(combined_buf2); +- } +- + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (i = 0; i < 
n_consecutive; i++) { +- consecutive_ios[i]->io_already_done = TRUE; ++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED); ++ consecutive_ios[i]->status = OS_AIO_DONE; + } + + /* We return the messages for the first slot now, and if there were +@@ -4061,6 +4155,8 @@ + slot_io_done: + + ut_a(slot->reserved); ++ ut_a(slot->status == OS_AIO_DONE); ++ slot->status = OS_AIO_CLAIMED; + + *message1 = slot->message1; + *message2 = slot->message2; +@@ -4070,7 +4166,8 @@ + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); +- ++ srv_set_io_thread_op_info(global_segment, "exited handler"); ++ + return(ret); + + wait_for_io: +@@ -4115,7 +4212,6 @@ + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); +- ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); +@@ -4165,11 +4261,20 @@ + double time_elapsed; + double avg_bytes_read; + ulint i; +- +- for (i = 0; i < srv_n_file_io_threads; i++) { +- fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i, +- srv_io_thread_op_info[i], +- srv_io_thread_function[i]); ++ ulint num_issued, num_done, num_claimed; ++ ++ if (file) { ++ for (i = 0; i < os_aio_n_segments; i++) { ++ fprintf(file, ++ "I/O thread %lu state: %s (%s) reads %lu writes %lu " ++ "requests %lu io secs %lf io msecs/request %lf max_io_wait %lf", ++ i, srv_io_thread_op_info[i], srv_io_thread_function[i], ++ os_aio_thread_io_reads[i], os_aio_thread_io_writes[i], ++ os_aio_thread_io_requests[i], ++ os_aio_thread_io_wait[i] / 1000000.0, ++ os_aio_thread_io_requests[i] ? 
++ os_aio_thread_io_wait[i] / os_aio_thread_io_requests[i] / 1000.0 : 0.0, ++ os_aio_thread_max_io_wait[i] / 1000.0); + + #ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { +@@ -4181,6 +4286,7 @@ + } + + fputs("Pending normal aio reads:", file); ++ } // if (file) + + array = os_aio_read_array; + loop: +@@ -4189,14 +4295,23 @@ + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); +- ut_a(array->n_segments > 0); + + n_reserved = 0; ++ num_done = num_issued = num_claimed = 0; + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { ++ if (slot->status == OS_AIO_ISSUED) ++ num_issued++; ++ else if (slot->status == OS_AIO_DONE) ++ num_done++; ++ else { ++ ut_ad(slot->status == OS_AIO_CLAIMED); ++ num_claimed++; ++ } ++ + n_reserved++; + /* fprintf(stderr, "Reserved slot, messages %p %p\n", + slot->message1, slot->message2); */ +@@ -4206,42 +4321,56 @@ + + ut_a(array->n_reserved == n_reserved); + +- fprintf(file, " %lu", (ulong) n_reserved); +- ++ if (file) fprintf(file, " %lu", (ulong) n_reserved); ++ + os_mutex_exit(array->mutex); + + if (array == os_aio_read_array) { +- fputs(", aio writes:", file); +- ++ inno_pending_normal_aio_reads = (ulong) n_reserved; ++ if (file) fputs(", aio writes:", file); + array = os_aio_write_array; + + goto loop; + } + + if (array == os_aio_write_array) { +- fputs(",\n ibuf aio reads:", file); ++ inno_pending_normal_aio_writes = (ulong) n_reserved; ++ if (file) fputs(",\n ibuf aio reads:", file); + array = os_aio_ibuf_array; + + goto loop; + } + + if (array == os_aio_ibuf_array) { +- fputs(", log i/o's:", file); ++ inno_pending_ibuf_aio_reads = (ulong) n_reserved; ++ if (file) fputs(", log i/o's:", file); + array = os_aio_log_array; + + goto loop; + } + + if (array == os_aio_log_array) { +- fputs(", sync i/o's:", file); ++ inno_pending_log_ios = (ulong) n_reserved; ++ if (file) fputs(", sync i/o's:", file); + array = os_aio_sync_array; + + goto loop; + } + +- 
putc('\n', file); ++ if (array == os_aio_sync_array) { ++ inno_pending_sync_ios = (ulong) n_reserved; ++ } ++ + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); ++ ++ if (file) { ++ putc('\n', file); ++ fprintf(file, ++ "Summary of background IO slot status: %lu issued, " ++ "%lu done, %lu claimed, sleep set %d\n", ++ num_issued, num_done, num_claimed, ++ os_aio_recommend_sleep_for_read_threads); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" +@@ -4274,6 +4403,7 @@ + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); ++ } // if (file) + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; +diff -r 322370200e6a innobase/srv/srv0srv.c +--- a/innobase/srv/srv0srv.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800 +@@ -164,7 +164,17 @@ + ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */ + ulint srv_lock_table_size = ULINT_MAX; + ++ulint srv_io_capacity = ULINT_MAX; /* Number of IO operations per ++ second the server can do */ ++ ++ibool srv_extra_dirty_writes = TRUE; /* Write dirty pages to disk when pct ++ dirty < max dirty pct */ ++ ++/* Deprecated by srv_n_{read,write}_io_threads */ + ulint srv_n_file_io_threads = ULINT_MAX; ++/* Number of background IO threads for read and write requests */ ++ulint srv_n_read_io_threads = ULINT_MAX; ++ulint srv_n_write_io_threads = ULINT_MAX; + + #ifdef UNIV_LOG_ARCHIVE + ibool srv_log_archive_on = FALSE; +@@ -238,6 +248,24 @@ + + /* variable to count the number of random read-aheads */ + ulint srv_read_ahead_rnd = 0; ++ ++/* Number of IO operations read/write done for all threads */ ++ulint os_aio_read_requests = 0; ++ulint os_aio_write_requests = 0; ++ ++/* Number of pages read/written done for all threads */ ++ulint os_aio_pages_read = 0; ++ulint os_aio_pages_written = 0; ++ ++/* time usec used to perform read/write for all threads */ ++ib_longlong 
os_aio_read_time = 0; ++ib_longlong os_aio_write_time = 0; ++ ++ulint inno_pending_normal_aio_reads = 0; ++ulint inno_pending_normal_aio_writes = 0; ++ulint inno_pending_ibuf_aio_reads = 0; ++ulint inno_pending_log_ios = 0; ++ulint inno_pending_sync_ios = 0; + + /* structure to pass status variables to MySQL */ + export_struc export_vars; +@@ -413,6 +441,23 @@ + + ulint srv_main_thread_process_no = 0; + ulint srv_main_thread_id = 0; ++ ++// The following count work done by srv_master_thread. ++ ++// Iterations by the 'once per second' loop. ++ulint srv_main_1_second_loops = 0; ++// Calls to sleep by the 'once per second' loop. ++ulint srv_main_sleeps = 0; ++// Iterations by the 'once per 10 seconds' loop. ++ulint srv_main_10_second_loops = 0; ++// Iterations of the loop bounded by the 'background_loop' label. ++ulint srv_main_background_loops = 0; ++// Iterations of the loop bounded by the 'flush_loop' label. ++ulint srv_main_flush_loops = 0; ++// Calls to log_buffer_flush_to_disk. ++ulint srv_sync_flush = 0; ++// Calls to log_buffer_flush_maybe_sync. ++ulint srv_async_flush = 0; + + /* + IMPLEMENTATION OF THE SERVER MAIN PROGRAM +@@ -2170,7 +2215,12 @@ + } + + /************************************************************************* +-The master thread controlling the server. */ ++Returns the number of IO operations that is X percent of the capacity. ++ ++PCT_IO(5) -> returns the number of IO operations that is 5% of the max ++where max is srv_io_capacity. 
++*/ ++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0))) + + #ifndef __WIN__ + void* +@@ -2199,11 +2249,15 @@ + ulint n_pend_ios; + ibool skip_sleep = FALSE; + ulint i; ++ + + #ifdef UNIV_DEBUG_THREAD_CREATION + fprintf(stderr, "Master thread starts, id %lu\n", + os_thread_pf(os_thread_get_curr_id())); + #endif ++ fprintf(stderr, "InnoDB master thread running with io_capacity %lu\n", ++ srv_io_capacity); ++ + srv_main_thread_process_no = os_proc_get_number(); + srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); + +@@ -2275,26 +2329,28 @@ + + srv_main_thread_op_info = "flushing log"; + log_buffer_flush_to_disk(); ++ srv_sync_flush++; + + srv_main_thread_op_info = "making checkpoint"; + log_free_check(); + +- /* If there were less than 5 i/os during the +- one second sleep, we assume that there is free +- disk i/o capacity available, and it makes sense to +- do an insert buffer merge. */ ++ /* If i/os during one second sleep were less than 5% of ++ capacity, we assume that there is free disk i/o capacity ++ available, and it makes sense to do an insert buffer merge. 
*/ + + n_pend_ios = buf_get_n_pending_ios() + + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) { ++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) { + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5)); + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + } + + if (buf_get_modified_ratio_pct() > +@@ -2303,7 +2359,8 @@ + /* Try to keep the number of modified pages in the + buffer pool under the limit wished by the user */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, ++ PCT_IO(100), + ut_dulint_max); + + /* If we had to do the flush, it may have taken +@@ -2325,36 +2382,47 @@ + + /* ---- We perform the following code approximately once per + 10 seconds when there is database activity */ ++ srv_main_10_second_loops++; + + #ifdef MEM_PERIODIC_CHECK + /* Check magic numbers of every allocated mem block once in 10 + seconds */ + mem_validate_all_blocks(); + #endif +- /* If there were less than 200 i/os during the 10 second period, +- we assume that there is free disk i/o capacity available, and it +- makes sense to flush 100 pages. */ ++ /* If i/os during the 10 second period were less than 200% of ++ capacity, we assume that there is free disk i/o capacity ++ available, and it makes sense to flush srv_io_capacity pages. ++ ++ Note that this is done regardless of the fraction of dirty ++ pages relative to the max requested by the user. The one second ++ loop above requests writes for that case. The writes done here ++ are not required, and may be disabled. 
*/ + + n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes; + n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + + buf_pool->n_pages_written; +- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) { ++ if (srv_extra_dirty_writes && ++ n_pend_ios < PCT_IO(3) && (n_ios - n_ios_very_old < PCT_IO(200))) { + + srv_main_thread_op_info = "flushing buffer pool pages"; +- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + } + + /* We run a batch of insert buffer merge every 10 seconds, + even if the server were active */ + + srv_main_thread_op_info = "doing insert buffer merge"; +- ibuf_contract_for_n_pages(TRUE, 5); ++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5)); + + srv_main_thread_op_info = "flushing log"; +- log_buffer_flush_to_disk(); ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; + + /* We run a full purge every 10 seconds, even if the server + were active */ +@@ -2378,8 +2446,9 @@ + if (difftime(current_time, last_flush_time) > 1) { + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ log_buffer_flush_to_disk(); + last_flush_time = current_time; ++ srv_sync_flush++; + } + } + +@@ -2393,14 +2462,14 @@ + (> 70 %), we assume we can afford reserving the disk(s) for + the time it requires to flush 100 pages */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), + ut_dulint_max); + } else { + /* Otherwise, we only flush a small number of pages so that + we do not unnecessarily use much disk i/o capacity from + other work */ + +- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, ++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 
PCT_IO(10), + ut_dulint_max); + } + +@@ -2434,7 +2503,7 @@ + + /* The server has been quiet for a while: start running background + operations */ +- ++ srv_main_background_loops++; + srv_main_thread_op_info = "doing background drop tables"; + + n_tables_to_drop = row_drop_tables_for_mysql_in_background(); +@@ -2472,6 +2541,7 @@ + + log_buffer_flush_to_disk(); + last_flush_time = current_time; ++ srv_sync_flush++; + } + } + +@@ -2487,9 +2557,13 @@ + srv_main_thread_op_info = "doing insert buffer merge"; + + if (srv_fast_shutdown && srv_shutdown_state > 0) { +- n_bytes_merged = 0; ++ n_bytes_merged = 0; + } else { +- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20); ++ /* This should do an amount of IO similar to the number of ++ * dirty pages that will be flushed in the call to ++ * buf_flush_batch below. Otherwise, the system favors ++ * clean pages over cleanup throughput. */ ++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100)); + } + + srv_main_thread_op_info = "reserving kernel mutex"; +@@ -2503,10 +2577,11 @@ + + flush_loop: + srv_main_thread_op_info = "flushing buffer pool pages"; ++ srv_main_flush_loops++; + + if (srv_fast_shutdown < 2) { + n_pages_flushed = +- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max); ++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max); + } else { + /* In the fastest shutdown we do not flush the buffer pool + to data files: we set n_pages_flushed to 0 artificially. 
*/ +@@ -2528,7 +2603,17 @@ + + srv_main_thread_op_info = "flushing log"; + +- log_buffer_flush_to_disk(); ++ current_time = time(NULL); ++ if (difftime(current_time, last_flush_time) > 1) { ++ srv_main_thread_op_info = (char*) "flushing log"; ++ log_buffer_flush_to_disk(); ++ last_flush_time = current_time; ++ srv_sync_flush++; ++ } else { ++ /* No fsync when srv_flush_log_at_trx_commit != 1 */ ++ log_buffer_flush_maybe_sync(); ++ srv_async_flush++; ++ } + + srv_main_thread_op_info = "making checkpoint"; + +diff -r 322370200e6a innobase/srv/srv0start.c +--- a/innobase/srv/srv0start.c Mon Nov 03 05:07:57 2008 -0800 ++++ b/innobase/srv/srv0start.c Mon Nov 03 05:08:52 2008 -0800 +@@ -973,6 +973,7 @@ + ulint i; + ibool srv_file_per_table_original_value = srv_file_per_table; + mtr_t mtr; ++ ulint n_threads; + #ifdef HAVE_DARWIN_THREADS + # ifdef F_FULLFSYNC + /* This executable has been compiled on Mac OS X 10.3 or later. +@@ -1206,24 +1207,32 @@ + } + + /* Restrict the maximum number of file i/o threads */ +- if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) { +- +- srv_n_file_io_threads = SRV_MAX_N_IO_THREADS; ++ if ((srv_n_read_io_threads + srv_n_write_io_threads) > SRV_MAX_N_IO_THREADS) { ++ fprintf(stderr, ++ "InnoDB: requested too many read(%d) or write(%d) IO threads, max is %d\n", ++ srv_n_read_io_threads, srv_n_write_io_threads, SRV_MAX_N_IO_THREADS); ++ return(DB_ERROR); + } + + if (!os_aio_use_native_aio) { +- /* In simulated aio we currently have use only for 4 threads */ +- srv_n_file_io_threads = 4; ++ /* More than 4 threads are now supported. */ ++ n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD, ++ srv_n_read_io_threads, ++ srv_n_write_io_threads, ++ SRV_MAX_N_PENDING_SYNC_IOS); ++ } else { ++ /* Might need more slots here. Alas, I don't do windows. 
*/ ++ n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD, ++ srv_n_read_io_threads, ++ srv_n_write_io_threads, ++ SRV_MAX_N_PENDING_SYNC_IOS); ++ } + +- os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD +- * srv_n_file_io_threads, +- srv_n_file_io_threads, +- SRV_MAX_N_PENDING_SYNC_IOS); +- } else { +- os_aio_init(SRV_N_PENDING_IOS_PER_THREAD +- * srv_n_file_io_threads, +- srv_n_file_io_threads, +- SRV_MAX_N_PENDING_SYNC_IOS); ++ if (n_threads > SRV_MAX_N_IO_THREADS) { ++ fprintf(stderr, ++ "InnoDB: requested too many IO threads(%d), max is %d\n", ++ n_threads, SRV_MAX_N_IO_THREADS); ++ return(DB_ERROR); + } + + fil_init(srv_max_n_open_files); +@@ -1259,11 +1268,11 @@ + + /* Create i/o-handler threads: */ + +- for (i = 0; i < srv_n_file_io_threads; i++) { ++ for (i = 0; i < n_threads; i++) { + n[i] = i; + + os_thread_create(io_handler_thread, n + i, thread_ids + i); +- } ++ } + + #ifdef UNIV_LOG_ARCHIVE + if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) { +diff -r 322370200e6a patch_info/innodb_io_tune.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/innodb_io_tune.info Mon Nov 03 05:08:52 2008 -0800 +@@ -0,0 +1,9 @@ ++File=innodb_io_tune.patch ++Name=Tune InnoDB IO settings ++Version=1.0 ++Author=Google ++License=GPL ++Comment= ++ChangeLog= ++2008-11-01 ++VT: Initial porting +diff -r 322370200e6a sql/ha_innodb.cc +--- a/sql/ha_innodb.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -147,7 +147,7 @@ + innobase_additional_mem_pool_size, innobase_file_io_threads, + innobase_lock_wait_timeout, innobase_force_recovery, + innobase_open_files; +- ++long innobase_read_io_threads, innobase_write_io_threads; + longlong innobase_buffer_pool_size, innobase_log_file_size; + + /* The default values for the following char* start-up parameters +@@ -175,6 +175,23 @@ + my_bool innobase_rollback_on_timeout = FALSE; + my_bool innobase_create_status_file = FALSE; + my_bool innobase_adaptive_hash_index = TRUE; ++ 
++/* Max number of IO requests merged to perform large IO in background ++ IO threads. ++*/ ++long innobase_max_merged_io = 64; ++ ++/* time interval in seconds allowed to calling innodb_show_status functions */ ++long innobase_min_status_update_time_interval = 30; ++ ++ ++/* Default number of IO per second supported by server. Tunes background ++ IO rate ++*/ ++long innobase_io_capacity = 100; ++ ++/* Write dirty pages when pct dirty is less than max pct dirty */ ++my_bool innobase_extra_dirty_writes = TRUE; + + static char *internal_innobase_data_file_path = NULL; + +@@ -1372,7 +1389,11 @@ + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + ++ srv_io_capacity = (ulint) innobase_io_capacity; ++ srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes; + srv_n_file_io_threads = (ulint) innobase_file_io_threads; ++ srv_n_read_io_threads = (ulint) innobase_read_io_threads; ++ srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout; + srv_force_recovery = (ulint) innobase_force_recovery; +diff -r 322370200e6a sql/ha_innodb.h +--- a/sql/ha_innodb.h Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800 +@@ -197,6 +197,7 @@ + + extern struct show_var_st innodb_status_variables[]; + extern ulong innobase_fast_shutdown; ++extern long innobase_max_merged_io; + extern ulong innobase_large_page_size; + extern long innobase_mirrored_log_groups, innobase_log_files_in_group; + extern longlong innobase_buffer_pool_size, innobase_log_file_size; +@@ -205,10 +206,14 @@ + extern long innobase_buffer_pool_awe_mem_mb; + extern long innobase_file_io_threads, innobase_lock_wait_timeout; + extern long innobase_force_recovery; ++extern long innobase_read_io_threads, innobase_write_io_threads; + extern long innobase_open_files; + extern char *innobase_data_home_dir, *innobase_data_file_path; + extern char *innobase_log_group_home_dir, *innobase_log_arch_dir; + extern char 
*innobase_unix_file_flush_method; ++extern long innobase_io_capacity; ++extern my_bool innobase_extra_dirty_writes; ++ + /* The following variables have to be my_bool for SHOW VARIABLES to work */ + extern my_bool innobase_log_archive, + innobase_use_doublewrite, +diff -r 322370200e6a sql/mysqld.cc +--- a/sql/mysqld.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -4932,6 +4932,11 @@ + OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE, + OPT_INNODB_MAX_PURGE_LAG, + OPT_INNODB_FILE_IO_THREADS, ++ OPT_INNODB_READ_IO_THREADS, ++ OPT_INNODB_WRITE_IO_THREADS, ++ OPT_INNODB_MAX_MERGED_IO, ++ OPT_INNODB_IO_CAPACITY, ++ OPT_INNODB_EXTRA_DIRTY_WRITES, + OPT_INNODB_LOCK_WAIT_TIMEOUT, + OPT_INNODB_THREAD_CONCURRENCY, + OPT_INNODB_COMMIT_CONCURRENCY, +@@ -5302,6 +5307,25 @@ + (gptr*) &global_system_variables.innodb_table_locks, + (gptr*) &global_system_variables.innodb_table_locks, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, ++ {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO, ++ "Max number of IO requests merged to issue large IO from background IO threads.", ++ (gptr*) &innobase_max_merged_io, ++ (gptr*) &innobase_max_merged_io, 0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0}, ++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS, ++ "Number of background read I/O threads in InnoDB.", (gptr*) &innobase_read_io_threads, ++ (gptr*) &innobase_read_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0}, ++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS, ++ "Number of background write I/O threads in InnoDB.", (gptr*) &innobase_write_io_threads, ++ (gptr*) &innobase_write_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0}, ++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY, ++ "Number of IO operations per second the server can do. 
Tunes background IO rate.", ++ (gptr*) &innobase_io_capacity, ++ (gptr*) &innobase_io_capacity, 0, GET_LONG, ++ REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0}, ++ {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES, ++ "When set, flush dirty buffer pages when dirty pct is less than max dirty pct. ", ++ (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes, ++ 0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0}, + #endif /* End HAVE_INNOBASE_DB */ + {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.", + (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0, +diff -r 322370200e6a sql/set_var.cc +--- a/sql/set_var.cc Mon Nov 03 05:07:57 2008 -0800 ++++ b/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800 +@@ -919,12 +919,14 @@ + {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR}, + {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL}, + {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL}, ++ {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL}, + {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS}, + {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG }, + {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL}, + {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS}, + {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR}, + {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG }, ++ {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG }, + {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG }, + {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL}, + {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR}, +@@ -943,6 +945,9 @@ + {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, 
SHOW_SYS}, + {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS}, + {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS}, ++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG }, ++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG }, ++ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG}, + #endif + {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS}, + {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS}, diff --git a/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch b/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch new file mode 100644 index 0000000..b4a1a79 --- /dev/null +++ b/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch @@ -0,0 +1,1357 @@ +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-05-20 14:21:44.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-05-20 14:39:34.000000000 +0900 +@@ -773,7 +773,7 @@ + rw_lock_s_lock(&btr_search_latch); + } + +- ut_ad(btr_search_latch.writer != RW_LOCK_EX); ++ ut_ad(btr_search_latch.writer_count == 0); + ut_ad(btr_search_latch.reader_count > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); +diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h +--- a/innobase/include/sync0rw.h 2009-01-30 06:42:20.000000000 +0900 ++++ b/innobase/include/sync0rw.h 2009-04-16 16:15:28.000000000 +0900 +@@ -325,7 +325,17 @@ + Accessor functions for rw lock. 
*/ + UNIV_INLINE + ulint +-rw_lock_get_waiters( ++rw_lock_get_s_waiters( ++/*==================*/ ++ rw_lock_t* lock); ++UNIV_INLINE ++ulint ++rw_lock_get_x_waiters( ++/*==================*/ ++ rw_lock_t* lock); ++UNIV_INLINE ++ulint ++rw_lock_get_wx_waiters( + /*================*/ + rw_lock_t* lock); + UNIV_INLINE +@@ -408,6 +418,17 @@ + rw_lock_debug_t* info); /* in: debug struct */ + #endif /* UNIV_SYNC_DEBUG */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++/* This value means NOT_LOCKED */ ++#define RW_LOCK_BIAS 0x00100000 ++#else ++#error HAVE_ATOMIC_BUILTINS is not defined. Do you use enough new GCC or compatibles? ++#error Or do you use exact options for CFLAGS? ++#error e.g. (for x86_32): "-m32 -march=i586 -mtune=i686" ++#error e.g. (for Sparc_64): "-m64 -mcpu=v9" ++#error Otherwise, this build may be slower than normal version. ++#endif ++ + /* NOTE! The structure appears here only for the compiler to know its size. + Do not use its fields directly! The structure used in the spin lock + implementation of a read-write lock. Several threads may have a shared lock +@@ -417,9 +438,9 @@ + field. Then no new readers are allowed in. */ + + struct rw_lock_struct { +- os_event_t event; /* Used by sync0arr.c for thread queueing */ +- +-#ifdef __WIN__ ++ /* Used by sync0arr.c for thread queueing */ ++ os_event_t s_event; /* Used for s_lock */ ++ os_event_t x_event; /* Used for x_lock */ + os_event_t wait_ex_event; /* This windows specific event is + used by the thread which has set the + lock state to RW_LOCK_WAIT_EX. The +@@ -427,31 +448,35 @@ + thread will be the next one to proceed + once the current the event gets + signalled. 
See LEMMA 2 in sync0sync.c */ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ volatile lint lock_word; /* Used by using atomic builtin */ + #endif + +- ulint reader_count; /* Number of readers who have locked this ++ volatile ulint reader_count; /* Number of readers who have locked this + lock in the shared mode */ +- ulint writer; /* This field is set to RW_LOCK_EX if there ++ volatile ulint writer; /* This field is set to RW_LOCK_EX if there + is a writer owning the lock (in exclusive + mode), RW_LOCK_WAIT_EX if a writer is + queueing for the lock, and + RW_LOCK_NOT_LOCKED, otherwise. */ +- os_thread_id_t writer_thread; ++ volatile os_thread_id_t writer_thread; + /* Thread id of a possible writer thread */ +- ulint writer_count; /* Number of times the same thread has ++ volatile ulint writer_count; /* Number of times the same thread has + recursively locked the lock in the exclusive + mode */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t mutex; /* The mutex protecting rw_lock_struct */ ++#endif + ulint pass; /* Default value 0. This is set to some + value != 0 given by the caller of an x-lock + operation, if the x-lock is to be passed to + another thread to unlock (which happens in + asynchronous i/o). */ +- ulint waiters; /* This ulint is set to 1 if there are +- waiters (readers or writers) in the global +- wait array, waiting for this rw_lock. +- Otherwise, == 0. 
*/ +- ibool writer_is_wait_ex; ++ volatile ulint s_waiters; /* 1: there are waiters (s_lock) */ ++ volatile ulint x_waiters; /* 1: there are waiters (x_lock) */ ++ volatile ulint wait_ex_waiters; /* 1: there are waiters (wait_ex) */ ++ volatile ibool writer_is_wait_ex; + /* This is TRUE if the writer field is + RW_LOCK_WAIT_EX; this field is located far + from the memory update hotspot fields which +diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic +--- a/innobase/include/sync0rw.ic 2009-01-30 06:42:20.000000000 +0900 ++++ b/innobase/include/sync0rw.ic 2009-04-16 17:06:53.000000000 +0900 +@@ -47,20 +47,64 @@ + Accessor functions for rw lock. */ + UNIV_INLINE + ulint +-rw_lock_get_waiters( ++rw_lock_get_s_waiters( + /*================*/ + rw_lock_t* lock) + { +- return(lock->waiters); ++ return(lock->s_waiters); + } + UNIV_INLINE +-void +-rw_lock_set_waiters( ++ulint ++rw_lock_get_x_waiters( + /*================*/ ++ rw_lock_t* lock) ++{ ++ return(lock->x_waiters); ++} ++UNIV_INLINE ++ulint ++rw_lock_get_wx_waiters( ++/*================*/ ++ rw_lock_t* lock) ++{ ++ return(lock->wait_ex_waiters); ++} ++UNIV_INLINE ++void ++rw_lock_set_s_waiters( + rw_lock_t* lock, + ulint flag) + { +- lock->waiters = flag; ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->s_waiters, flag); ++#else ++ lock->s_waiters = flag; ++#endif ++} ++UNIV_INLINE ++void ++rw_lock_set_x_waiters( ++ rw_lock_t* lock, ++ ulint flag) ++{ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->x_waiters, flag); ++#else ++ lock->x_waiters = flag; ++#endif ++} ++UNIV_INLINE ++void ++rw_lock_set_wx_waiters( ++/*================*/ ++ rw_lock_t* lock, ++ ulint flag) ++{ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_lock_test_and_set(&lock->wait_ex_waiters, flag); ++#else ++ lock->wait_ex_waiters = flag; ++#endif + } + UNIV_INLINE + ulint +@@ -68,7 +112,19 @@ + /*===============*/ + rw_lock_t* lock) + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (lock->writer == 
RW_LOCK_NOT_LOCKED) { ++ return(RW_LOCK_NOT_LOCKED); ++ } ++ ++ if (lock->writer_is_wait_ex) { ++ return(RW_LOCK_WAIT_EX); ++ } else { ++ return(RW_LOCK_EX); ++ } ++#else + return(lock->writer); ++#endif + } + UNIV_INLINE + void +@@ -96,6 +152,7 @@ + { + lock->reader_count = count; + } ++#ifndef HAVE_ATOMIC_BUILTINS + UNIV_INLINE + mutex_t* + rw_lock_get_mutex( +@@ -104,6 +161,7 @@ + { + return(&(lock->mutex)); + } ++#endif + + /********************************************************************** + Returns the value of writer_count for the lock. Does not reserve the lock +@@ -133,14 +191,26 @@ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG ++#if defined(UNIV_SYNC_DEBUG) && !defined(HAVE_ATOMIC_BUILTINS) + ut_ad(mutex_own(rw_lock_get_mutex(lock))); + #endif /* UNIV_SYNC_DEBUG */ + /* Check if the writer field is free */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (UNIV_LIKELY(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)) { ++ /* try s-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) { ++ /* fail */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ return(FALSE); /* locking did not succeed */ ++ } ++ /* success */ ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) { + /* Set the shared lock by incrementing the reader count */ + lock->reader_count++; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, +@@ -167,11 +237,15 @@ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ + { +- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED); ++ ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); + ut_ad(rw_lock_get_reader_count(lock) == 0); + + /* Set the shared lock by incrementing the reader count */ ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + lock->reader_count++; 
++#endif + + lock->last_s_file_name = file_name; + lock->last_s_line = line; +@@ -199,7 +273,11 @@ + + rw_lock_set_writer(lock, RW_LOCK_EX); + lock->writer_thread = os_thread_get_curr_id(); ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(lock->writer_count),1); ++#else + lock->writer_count++; ++#endif + lock->pass = 0; + + lock->last_x_file_name = file_name; +@@ -241,15 +319,21 @@ + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + #endif /* UNIV_SYNC_DEBUG */ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + rw_lock_s_lock_spin(lock, pass, file_name, line); + +@@ -272,11 +356,23 @@ + { + ibool success = FALSE; + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ /* try s-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) { ++ /* fail */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ return(FALSE); /* locking did not succeed */ ++ } ++ /* success */ ++ __sync_fetch_and_add(&(lock->reader_count),1); ++#else + mutex_enter(rw_lock_get_mutex(lock)); + + if (lock->writer == RW_LOCK_NOT_LOCKED) { + /* Set the shared lock by incrementing the reader count */ + lock->reader_count++; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, +@@ -289,7 +385,9 @@ + success = TRUE; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return(success); + } +@@ -309,6 +407,54 @@ + { + ibool success = FALSE; + os_thread_id_t curr_thread = os_thread_get_curr_id(); ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (lock->reader_count == 0) { ++ /* try to lock writer */ ++ 
if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX) ++ == RW_LOCK_NOT_LOCKED) { ++ /* success */ ++retry_x_lock: ++ /* try x-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* success */ ++ lock->writer_thread = curr_thread; ++ lock->pass = 0; ++ lock->writer_is_wait_ex = FALSE; ++ /* next function may work as memory barrier */ ++ relock: ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ ut_ad(rw_lock_validate(lock)); ++ ++ return(TRUE); ++ } else { ++ /* fail (x-lock) */ ++ if (__sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS) ++ == 0) ++ goto retry_x_lock; ++ } ++ ++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED); ++ } ++ } ++ ++ if (lock->pass == 0 ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ goto relock; ++ } ++ ++ //ut_ad(rw_lock_validate(lock)); ++ ++ return(FALSE); ++#else + mutex_enter(rw_lock_get_mutex(lock)); + + if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) { +@@ -339,6 +485,7 @@ + ut_ad(rw_lock_validate(lock)); + + return(success); ++#endif + } + + /********************************************************************** +@@ -354,16 +501,33 @@ + #endif + ) + { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t* mutex = &(lock->mutex); +- ibool sg = FALSE; ++#endif ++ ibool x_sg = FALSE; ++ ibool wx_sg = FALSE; ++#ifdef HAVE_ATOMIC_BUILTINS ++ ibool last = FALSE; ++#endif + ++#ifndef HAVE_ATOMIC_BUILTINS + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter(mutex); ++#endif + + /* Reset the shared lock by decrementing the reader count */ + + ut_a(lock->reader_count > 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* unlock lock_word */ ++ __sync_fetch_and_add(&(lock->lock_word),1); ++ ++ if(__sync_sub_and_fetch(&(lock->reader_count),1) == 0) { ++ last = TRUE; ++ } ++#else + lock->reader_count--; ++#endif 
+ + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); +@@ -372,22 +536,39 @@ + /* If there may be waiters and this was the last s-lock, + signal the object */ + +- if (UNIV_UNLIKELY(lock->waiters) ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0))) { ++ os_event_set(lock->wait_ex_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++ else if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->x_waiters, 0))) { ++ os_event_set(lock->x_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++#else ++ if (UNIV_UNLIKELY(lock->wait_ex_waiters) + && lock->reader_count == 0) { +- sg = TRUE; ++ wx_sg = TRUE; + +- rw_lock_set_waiters(lock, 0); ++ rw_lock_set_wx_waiters(lock, 0); ++ } ++ else if (UNIV_UNLIKELY(lock->x_waiters) ++ && lock->reader_count == 0) { ++ x_sg = TRUE; ++ ++ rw_lock_set_x_waiters(lock, 0); + } + + mutex_exit(mutex); + +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ ++ if (UNIV_UNLIKELY(wx_sg)) { + os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } else if (UNIV_UNLIKELY(x_sg)) { ++ os_event_set(lock->x_event); + sync_array_object_signalled(sync_primary_wait_array); + } ++#endif + + ut_ad(rw_lock_validate(lock)); + +@@ -409,13 +590,22 @@ + + ut_ad(lock->reader_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_sub_and_fetch(&(lock->reader_count),1); ++#else + lock->reader_count--; ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED); + #endif + ++#ifdef HAVE_ATOMIC_BUILTINS ++ ut_ad(!lock->s_waiters); ++ ut_ad(!lock->x_waiters); ++#else + ut_ad(!lock->waiters); ++#endif + ut_ad(rw_lock_validate(lock)); + #ifdef UNIV_SYNC_PERF_STAT + rw_s_exit_count++; +@@ -435,41 +625,83 @@ + #endif + ) + { +- ibool sg = FALSE; ++#ifdef HAVE_ATOMIC_BUILTINS ++ ibool last = FALSE; ++#endif ++ ibool s_sg = FALSE; ++ ibool x_sg = 
FALSE; + ++#ifndef HAVE_ATOMIC_BUILTINS + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter(&(lock->mutex)); ++#endif + + /* Reset the exclusive lock if this thread no longer has an x-mode + lock */ + + ut_ad(lock->writer_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) { ++ last = TRUE; ++ } ++ ++ if (last) { ++ /* unlock lock_word */ ++ __sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS); ++ ++ /* FIXME: It is a value of bad manners for pthread. ++ But we shouldn't keep an ID of not-owner. */ ++ lock->writer_thread = -1; ++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED); ++ } ++#else + lock->writer_count--; + + if (lock->writer_count == 0) { + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + } ++#endif + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); + #endif + + /* If there may be waiters, signal the lock */ +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->writer_count == 0) { +- +- sg = TRUE; +- rw_lock_set_waiters(lock, 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (last) { ++ if(__sync_lock_test_and_set(&lock->s_waiters, 0)){ ++ s_sg = TRUE; ++ } ++ if(__sync_lock_test_and_set(&lock->x_waiters, 0)){ ++ x_sg = TRUE; ++ } ++ } ++#else ++ if (lock->writer_count == 0) { ++ if(lock->s_waiters){ ++ s_sg = TRUE; ++ rw_lock_set_s_waiters(lock, 0); ++ } ++ if(lock->x_waiters){ ++ x_sg = TRUE; ++ rw_lock_set_x_waiters(lock, 0); ++ } + } + + mutex_exit(&(lock->mutex)); ++#endif + +- if (UNIV_UNLIKELY(sg)) { ++ if (UNIV_UNLIKELY(s_sg)) { ++ os_event_set(lock->s_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++ if (UNIV_UNLIKELY(x_sg)) { + #ifdef __WIN__ ++ /* I doubt the necessity of it. 
*/ + os_event_set(lock->wait_ex_event); + #endif +- os_event_set(lock->event); ++ os_event_set(lock->x_event); + sync_array_object_signalled(sync_primary_wait_array); + } + +@@ -494,9 +726,13 @@ + + ut_ad(lock->writer_count > 0); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) { ++#else + lock->writer_count--; + + if (lock->writer_count == 0) { ++#endif + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + } + +@@ -504,7 +740,12 @@ + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); + #endif + ++#ifdef HAVE_ATOMIC_BUILTINS ++ ut_ad(!lock->s_waiters); ++ ut_ad(!lock->x_waiters); ++#else + ut_ad(!lock->waiters); ++#endif + ut_ad(rw_lock_validate(lock)); + + #ifdef UNIV_SYNC_PERF_STAT +diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c +--- a/innobase/sync/sync0arr.c 2009-01-30 06:42:24.000000000 +0900 ++++ b/innobase/sync/sync0arr.c 2009-04-16 16:15:28.000000000 +0900 +@@ -309,13 +309,13 @@ + { + if (type == SYNC_MUTEX) { + return(os_event_reset(((mutex_t *) object)->event)); +-#ifdef __WIN__ + } else if (type == RW_LOCK_WAIT_EX) { + return(os_event_reset( + ((rw_lock_t *) object)->wait_ex_event)); +-#endif +- } else { +- return(os_event_reset(((rw_lock_t *) object)->event)); ++ } else if (type == RW_LOCK_SHARED) { ++ return(os_event_reset(((rw_lock_t *) object)->s_event)); ++ } else { /* RW_LOCK_EX */ ++ return(os_event_reset(((rw_lock_t *) object)->x_event)); + } + } + +@@ -415,15 +415,12 @@ + + if (cell->request_type == SYNC_MUTEX) { + event = ((mutex_t*) cell->wait_object)->event; +-#ifdef __WIN__ +- /* On windows if the thread about to wait is the one which +- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then +- it waits on a special event i.e.: wait_ex_event. 
*/ + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + event = ((rw_lock_t*) cell->wait_object)->wait_ex_event; +-#endif +- } else { +- event = ((rw_lock_t*) cell->wait_object)->event; ++ } else if (cell->request_type == RW_LOCK_SHARED) { ++ event = ((rw_lock_t*) cell->wait_object)->s_event; ++ } else { ++ event = ((rw_lock_t*) cell->wait_object)->x_event; + } + + cell->waiting = TRUE; +@@ -464,6 +461,7 @@ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; ++ ulint writer; + + type = cell->request_type; + +@@ -492,12 +490,10 @@ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX +-#ifdef __WIN__ + || type == RW_LOCK_WAIT_EX +-#endif + || type == RW_LOCK_SHARED) { + +- fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); ++ fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file); + + rwlock = cell->old_wait_rw_lock; + +@@ -505,21 +501,23 @@ + " RW-latch at %p created in file %s line %lu\n", + rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); +- if (rwlock->writer != RW_LOCK_NOT_LOCKED) { ++ writer = rw_lock_get_writer(rwlock); ++ if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), +- rwlock->writer == RW_LOCK_EX ++ writer == RW_LOCK_EX + ? 
" exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, +- "number of readers %lu, waiters flag %lu\n" ++ "number of readers %lu, s_waiters flag %lu, x_waiters flag %lu\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rwlock->reader_count, +- (ulong) rwlock->waiters, ++ (ulong) rwlock->s_waiters, ++ (ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters), + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, +@@ -839,11 +837,15 @@ + /*========================*/ + sync_array_t* arr) /* in: wait array */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ __sync_fetch_and_add(&(arr->sg_count),1); ++#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); ++#endif + } + + /************************************************************************** +@@ -880,19 +882,23 @@ + + mutex = cell->wait_object; + os_event_set(mutex->event); +-#ifdef __WIN__ + } else if (cell->request_type + == RW_LOCK_WAIT_EX) { + rw_lock_t* lock; + + lock = cell->wait_object; + os_event_set(lock->wait_ex_event); +-#endif +- } else { ++ } else if (cell->request_type ++ == RW_LOCK_SHARED) { + rw_lock_t* lock; + + lock = cell->wait_object; +- os_event_set(lock->event); ++ os_event_set(lock->s_event); ++ } else { ++ rw_lock_t* lock; ++ ++ lock = cell->wait_object; ++ os_event_set(lock->x_event); + } + } + } +diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c +--- a/innobase/sync/sync0rw.c 2009-01-30 06:42:24.000000000 +0900 ++++ b/innobase/sync/sync0rw.c 2009-04-16 17:33:59.000000000 +0900 +@@ -99,6 +99,7 @@ + object is created, then the following call initializes + the sync system. 
*/ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_create(rw_lock_get_mutex(lock)); + mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + +@@ -108,8 +109,14 @@ + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ ++#endif /* !HAVE_ATOMIC_BUILTINS */ + +- rw_lock_set_waiters(lock, 0); ++#ifdef HAVE_ATOMIC_BUILTINS ++ lock->lock_word = RW_LOCK_BIAS; ++#endif ++ rw_lock_set_s_waiters(lock, 0); ++ rw_lock_set_x_waiters(lock, 0); ++ rw_lock_set_wx_waiters(lock, 0); + rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); + lock->writer_count = 0; + rw_lock_set_reader_count(lock, 0); +@@ -130,11 +137,9 @@ + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; +- lock->event = os_event_create(NULL); +- +-#ifdef __WIN__ ++ lock->s_event = os_event_create(NULL); ++ lock->x_event = os_event_create(NULL); + lock->wait_ex_event = os_event_create(NULL); +-#endif + + mutex_enter(&rw_lock_list_mutex); + +@@ -162,19 +167,21 @@ + ut_a(rw_lock_validate(lock)); + #endif /* UNIV_DEBUG */ + ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); +- ut_a(rw_lock_get_waiters(lock) == 0); ++ ut_a(rw_lock_get_s_waiters(lock) == 0); ++ ut_a(rw_lock_get_x_waiters(lock) == 0); ++ ut_a(rw_lock_get_wx_waiters(lock) == 0); + ut_a(rw_lock_get_reader_count(lock) == 0); + + lock->magic_n = 0; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_free(rw_lock_get_mutex(lock)); ++#endif + + mutex_enter(&rw_lock_list_mutex); +- os_event_free(lock->event); +- +-#ifdef __WIN__ ++ os_event_free(lock->s_event); ++ os_event_free(lock->x_event); + os_event_free(lock->wait_ex_event); +-#endif + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); +@@ -192,26 +199,43 @@ + Checks that the rw-lock has been initialized and that there are no + simultaneous shared and exclusive locks. */ + ++/* MEMO: If HAVE_ATOMIC_BUILTINS, we should use this function statically. 
*/ ++ + ibool + rw_lock_validate( + /*=============*/ + rw_lock_t* lock) + { ++ ulint test; + ut_a(lock); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); ++#ifndef HAVE_ATOMIC_BUILTINS + ut_a((rw_lock_get_reader_count(lock) == 0) + || (rw_lock_get_writer(lock) != RW_LOCK_EX)); +- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)); +- ut_a((rw_lock_get_waiters(lock) == 0) +- || (rw_lock_get_waiters(lock) == 1)); ++#endif ++ test = rw_lock_get_writer(lock); ++ ut_a((test == RW_LOCK_EX) ++ || (test == RW_LOCK_WAIT_EX) ++ || (test == RW_LOCK_NOT_LOCKED)); ++ test = rw_lock_get_s_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++ test = rw_lock_get_x_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++ test = rw_lock_get_wx_waiters(lock); ++ ut_a((test == 0) ++ || (test == 1)); ++#ifndef HAVE_ATOMIC_BUILTINS + ut_a((lock->writer != RW_LOCK_EX) || (lock->writer_count > 0)); + + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return(TRUE); + } +@@ -237,13 +261,14 @@ + ut_ad(rw_lock_validate(lock)); + + lock_loop: ++ i = 0; ++spin_loop: + rw_s_spin_wait_count++; + + /* Spin waiting for the writer field to become free */ +- i = 0; + +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS ++ && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } +@@ -262,15 +287,27 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + /* We try once again to obtain the lock */ + + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Success */ + } else { ++#ifdef 
HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ i++; ++ ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto spin_loop; ++ } ++#endif + /* If we get here, locking did not succeed, we may + suspend the thread to wait in the wait array */ + +@@ -281,9 +318,26 @@ + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); ++ rw_lock_set_s_waiters(lock, 1); ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ for (i = 0; i < 4; i++) { ++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Success */ ++ } ++ } + ++ /* If wait_ex_waiter stalls, wakes it. */ ++ if (lock->reader_count == 0 ++ && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0)) { ++ os_event_set(lock->wait_ex_event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } ++#else + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -318,13 +372,19 @@ + { + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + lock->writer_thread = os_thread_get_curr_id(); + + lock->pass = 0; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#else ++ __sync_synchronize(); ++#endif + } + + /********************************************************************** +@@ -342,6 +402,89 @@ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); ++retry_writer: ++ /* try to lock writer */ ++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX) ++ == RW_LOCK_NOT_LOCKED) { ++ /* success */ ++ /* obtain RW_LOCK_WAIT_EX right */ ++ lock->writer_thread = curr_thread; ++ lock->pass = pass; ++ lock->writer_is_wait_ex = TRUE; ++ /* atomic operation may be safer about memory order. 
*/ ++ __sync_synchronize(); ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, ++ file_name, line); ++#endif ++ } ++ ++ if (!os_thread_eq(lock->writer_thread, curr_thread)) { ++ return(RW_LOCK_NOT_LOCKED); ++ } ++ ++ switch(rw_lock_get_writer(lock)) { ++ case RW_LOCK_WAIT_EX: ++ /* have right to try x-lock */ ++retry_x_lock: ++ /* try x-lock */ ++ if(__sync_sub_and_fetch(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* success */ ++ lock->pass = pass; ++ lock->writer_is_wait_ex = FALSE; ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ file_name, line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ /* Locking succeeded, we may return */ ++ return(RW_LOCK_EX); ++ } else if(__sync_fetch_and_add(&(lock->lock_word), ++ RW_LOCK_BIAS) == 0) { ++ /* retry x-lock */ ++ goto retry_x_lock; ++ } ++ ++ /* There are readers, we have to wait */ ++ return(RW_LOCK_WAIT_EX); ++ ++ break; ++ ++ case RW_LOCK_EX: ++ /* already have x-lock */ ++ if ((lock->pass == 0)&&(pass == 0)) { ++ __sync_fetch_and_add(&(lock->writer_count),1); ++ ++#ifdef UNIV_SYNC_DEBUG ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, ++ line); ++#endif ++ ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; ++ ++ /* Locking succeeded, we may return */ ++ return(RW_LOCK_EX); ++ } ++ ++ return(RW_LOCK_NOT_LOCKED); ++ ++ break; ++ ++ default: /* RW_LOCK_NOT_LOCKED? 
maybe impossible */ ++ goto retry_writer; ++ } ++#else /* HAVE_ATOMIC_BUILTINS */ ++ + #ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(rw_lock_get_mutex(lock))); + #endif /* UNIV_SYNC_DEBUG */ +@@ -423,6 +566,7 @@ + /* Locking succeeded, we may return */ + return(RW_LOCK_EX); + } ++#endif /* HAVE_ATOMIC_BUILTINS */ + + /* Locking did not succeed */ + return(RW_LOCK_NOT_LOCKED); +@@ -448,19 +592,33 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint state; /* lock state acquired */ ++ ulint state = RW_LOCK_NOT_LOCKED; /* lock state acquired */ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ulint prev_state = RW_LOCK_NOT_LOCKED; ++#endif + ulint i; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + + lock_loop: ++ i = 0; ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ prev_state = state; ++#else + /* Acquire the mutex protecting the rw-lock fields */ + mutex_enter_fast(&(lock->mutex)); ++#endif + + state = rw_lock_x_lock_low(lock, pass, file_name, line); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (state != prev_state) i=0; /* if progress, reset counter. 
*/ ++#else + mutex_exit(&(lock->mutex)); ++#endif + ++spin_loop: + if (state == RW_LOCK_EX) { + + return; /* Locking succeeded */ +@@ -468,10 +626,9 @@ + } else if (state == RW_LOCK_NOT_LOCKED) { + + /* Spin waiting for the writer field to become free */ +- i = 0; + +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS ++ && lock->lock_word != RW_LOCK_BIAS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); +@@ -485,9 +642,12 @@ + } else if (state == RW_LOCK_WAIT_EX) { + + /* Spin waiting for the reader count field to become zero */ +- i = 0; + ++#ifdef HAVE_ATOMIC_BUILTINS ++ while (lock->lock_word != RW_LOCK_BIAS ++#else + while (rw_lock_get_reader_count(lock) != 0 ++#endif + && i < SYNC_SPIN_ROUNDS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, +@@ -500,7 +660,6 @@ + os_thread_yield(); + } + } else { +- i = 0; /* Eliminate a compiler warning */ + ut_error; + } + +@@ -516,34 +675,69 @@ + /* We try once again to obtain the lock. Acquire the mutex protecting + the rw-lock fields */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++ prev_state = state; ++#else + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + state = rw_lock_x_lock_low(lock, pass, file_name, line); + ++#ifdef HAVE_ATOMIC_BUILTINS ++ if (state != prev_state) i=0; /* if progress, reset counter. */ ++#endif ++ + if (state == RW_LOCK_EX) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + return; /* Locking succeeded */ + } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ i++; ++ ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto spin_loop; ++ } ++#endif ++ + rw_x_system_call_count++; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, +-#ifdef __WIN__ +- /* On windows RW_LOCK_WAIT_EX signifies +- that this thread should wait on the +- special wait_ex_event. */ + (state == RW_LOCK_WAIT_EX) + ? 
RW_LOCK_WAIT_EX : +-#endif + RW_LOCK_EX, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); ++ if (state == RW_LOCK_WAIT_EX) { ++ rw_lock_set_wx_waiters(lock, 1); ++ } else { ++ rw_lock_set_x_waiters(lock, 1); ++ } + ++#ifdef HAVE_ATOMIC_BUILTINS ++ /* like sync0sync.c doing */ ++ for (i = 0; i < 4; i++) { ++ prev_state = state; ++ state = rw_lock_x_lock_low(lock, pass, file_name, line); ++ if (state == RW_LOCK_EX) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Locking succeeded */ ++ } ++ if (state != prev_state) { ++ /* retry! */ ++ sync_array_free_cell(sync_primary_wait_array, index); ++ goto lock_loop; ++ } ++ } ++#else + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -718,7 +912,9 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + info = UT_LIST_GET_FIRST(lock->debug_list); + +@@ -728,7 +924,9 @@ + && (info->pass == 0) + && (info->lock_type == lock_type)) { + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + /* Found! 
*/ + + return(TRUE); +@@ -736,7 +934,9 @@ + + info = UT_LIST_GET_NEXT(list, info); + } ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + + return(FALSE); + } +@@ -758,21 +958,25 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + if (lock_type == RW_LOCK_SHARED) { + if (lock->reader_count > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { +- if (lock->writer == RW_LOCK_EX) { ++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + + return(ret); + } +@@ -801,16 +1005,26 @@ + + count++; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); ++#endif + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++ || (rw_lock_get_s_waiters(lock) != 0) ++ || (rw_lock_get_x_waiters(lock) != 0) ++ || (rw_lock_get_wx_waiters(lock) != 0)) { + + fprintf(stderr, "RW-LOCK: %p ", lock); + +- if (rw_lock_get_waiters(lock)) { +- fputs(" Waiters for the lock exist\n", stderr); ++ if (rw_lock_get_s_waiters(lock)) { ++ fputs(" s_waiters for the lock exist,", stderr); ++ } ++ if (rw_lock_get_x_waiters(lock)) { ++ fputs(" x_waiters for the lock exist\n", stderr); ++ } ++ if (rw_lock_get_wx_waiters(lock)) { ++ fputs(" wait_ex_waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } +@@ -822,7 +1036,9 @@ + } + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif + lock = UT_LIST_GET_NEXT(list, lock); + } + +@@ -847,10 +1063,18 @@ + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++ || (rw_lock_get_s_waiters(lock) != 0) ++ || (rw_lock_get_x_waiters(lock) != 0) ++ || (rw_lock_get_wx_waiters(lock) != 0)) { + +- if (rw_lock_get_waiters(lock)) { +- 
fputs(" Waiters for the lock exist\n", stderr); ++ if (rw_lock_get_s_waiters(lock)) { ++ fputs(" s_waiters for the lock exist,", stderr); ++ } ++ if (rw_lock_get_x_waiters(lock)) { ++ fputs(" x_waiters for the lock exist\n", stderr); ++ } ++ if (rw_lock_get_wx_waiters(lock)) { ++ fputs(" wait_ex_waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } +@@ -909,14 +1133,18 @@ + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(rw_lock_get_mutex(lock)); ++#endif + + if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) + || (rw_lock_get_reader_count(lock) != 0)) { + count++; + } + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(rw_lock_get_mutex(lock)); ++#endif + lock = UT_LIST_GET_NEXT(list, lock); + } + +diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_rw_lock.info 2009-04-16 16:15:28.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_rw_lock.patch ++Name=Fix of InnoDB rw_locks ++Version=1.0 ++Author=Yasufumi Kinoshita ++License=BSD ++Comment= diff --git a/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch b/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch new file mode 100644 index 0000000..bf8f6b4 --- /dev/null +++ b/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch @@ -0,0 +1,264 @@ +diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c +--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900 +@@ -2454,13 +2454,15 @@ + (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped)); + } + fprintf(file, +- "Buffer pool size %lu\n" +- "Free buffers %lu\n" +- "Database pages %lu\n" +- "Modified db pages %lu\n" ++ "Buffer pool size %lu\n" ++ "Buffer pool size, bytes %lu\n" 
++ "Free buffers %lu\n" ++ "Database pages %lu\n" ++ "Modified db pages %lu\n" + "Pending reads %lu\n" + "Pending writes: LRU %lu, flush list %lu, single page %lu\n", + (ulong) size, ++ (ulong) size * UNIV_PAGE_SIZE, + (ulong) UT_LIST_GET_LEN(buf_pool->free), + (ulong) UT_LIST_GET_LEN(buf_pool->LRU), + (ulong) UT_LIST_GET_LEN(buf_pool->flush_list), +diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c +--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900 +@@ -4472,3 +4472,30 @@ + + return(mach_read_from_2(page + FIL_PAGE_TYPE)); + } ++ ++/************************************************************************* ++Return local hash table informations. */ ++ ++ulint ++fil_system_hash_cells(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (fil_system->spaces->n_cells ++ + fil_system->name_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++fil_system_hash_nodes(void) ++/*=======================*/ ++{ ++ if (fil_system) { ++ return (UT_LIST_GET_LEN(fil_system->space_list) ++ * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h +--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900 +@@ -701,6 +701,16 @@ + written to page, the return value not defined */ + byte* page); /* in: file page */ + ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++fil_system_hash_cells(void); ++/*========================*/ ++ ++ulint ++fil_system_hash_nodes(void); ++/*========================*/ + + typedef struct fil_space_struct fil_space_t; + +diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h +--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900 +@@ -77,6 +77,17 @@ + /*=============================*/ + /* out: pointer to the in_ibuf field */ + ++/************************************************************************* ++Return local hash table informations. */ ++ ++ulint ++thr_local_hash_cells(void); ++/*=======================*/ ++ ++ulint ++thr_local_hash_nodes(void); ++/*=======================*/ ++ + #ifndef UNIV_NONINL + #include "thr0loc.ic" + #endif +diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c +--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900 +@@ -1645,6 +1645,14 @@ + time_t current_time; + ulint n_reserved; + ++ ulint btr_search_sys_subtotal; ++ ulint lock_sys_subtotal; ++ ulint recv_sys_subtotal; ++ ulint io_counter_subtotal; ++ ++ ulint i; ++ trx_t* trx; ++ + mutex_enter(&srv_innodb_monitor_mutex); + + current_time = time(NULL); +@@ -1747,6 +1755,80 @@ + ut_total_allocated_memory, + mem_pool_get_reserved(mem_comm_pool)); + ++ /* Calcurate reserved memories */ ++ if (btr_search_sys && btr_search_sys->hash_index->heap) { ++ btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap); ++ } else { ++ btr_search_sys_subtotal = 0; ++ for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) { ++ btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]); ++ } ++ } ++ ++ lock_sys_subtotal = 
0; ++ if (trx_sys) { ++ mutex_enter(&kernel_mutex); ++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list); ++ while (trx) { ++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0); ++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx); ++ } ++ mutex_exit(&kernel_mutex); ++ } ++ ++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash) ++ ? mem_heap_get_size(recv_sys->heap) : 0); ++ ++ fprintf(file, ++ "Internal hash tables (constant factor + variable factor)\n" ++ " Adaptive hash index %lu \t(%lu + %lu)\n" ++ " Page hash %lu\n" ++ " Dictionary cache %lu \t(%lu + %lu)\n" ++ " File system %lu \t(%lu + %lu)\n" ++ " Lock system %lu \t(%lu + %lu)\n" ++ " Recovery system %lu \t(%lu + %lu)\n" ++ " Threads %lu \t(%lu + %lu)\n", ++ ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0) ++ + btr_search_sys_subtotal, ++ (ulong) (btr_search_sys ++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) btr_search_sys_subtotal, ++ ++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)), ++ ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t) ++ + dict_sys->size) : 0), ++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells ++ + dict_sys->table_id_hash->n_cells ++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0), ++ (ulong) (dict_sys ? (dict_sys->size) : 0), ++ ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t) ++ + fil_system_hash_nodes()), ++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) fil_system_hash_nodes(), ++ ++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + lock_sys_subtotal), ++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) lock_sys_subtotal, ++ ++ (ulong) (((recv_sys && recv_sys->addr_hash) ++ ? 
(recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0) ++ + recv_sys_subtotal), ++ (ulong) ((recv_sys && recv_sys->addr_hash) ++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0), ++ (ulong) recv_sys_subtotal, ++ ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t) ++ + thr_local_hash_nodes()), ++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)), ++ (ulong) thr_local_hash_nodes()); ++ + if (srv_use_awe) { + fprintf(file, + "In addition to that %lu MB of AWE memory allocated\n", +diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c +--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900 +@@ -32,6 +32,7 @@ + + /* The hash table. The module is not yet initialized when it is NULL. */ + hash_table_t* thr_local_hash = NULL; ++ulint thr_local_hash_n_nodes = 0; + + /* The private data for each thread should be put to + the structure below and the accessor functions written +@@ -223,6 +224,7 @@ + HASH_INSERT(thr_local_t, hash, thr_local_hash, + os_thread_pf(os_thread_get_curr_id()), + local); ++ thr_local_hash_n_nodes++; + + mutex_exit(&thr_local_mutex); + } +@@ -251,6 +253,7 @@ + + HASH_DELETE(thr_local_t, hash, thr_local_hash, + os_thread_pf(id), local); ++ thr_local_hash_n_nodes--; + + mutex_exit(&thr_local_mutex); + +@@ -274,3 +277,29 @@ + mutex_create(&thr_local_mutex); + mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL); + } ++ ++/************************************************************************* ++Return local hash table informations. 
*/ ++ ++ulint ++thr_local_hash_cells(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash->n_cells); ++ } else { ++ return 0; ++ } ++} ++ ++ulint ++thr_local_hash_nodes(void) ++/*======================*/ ++{ ++ if (thr_local_hash) { ++ return (thr_local_hash_n_nodes ++ * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE)); ++ } else { ++ return 0; ++ } ++} +diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_show_hashed_memory.patch ++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS ++Version=1.0 ++Author=Percona <info@percona.com> ++License=GPL ++Comment= diff --git a/percona/5.0.87-b20-20091116/mirror_binlog.patch b/percona/5.0.87-b20-20091116/mirror_binlog.patch new file mode 100644 index 0000000..d52e806 --- /dev/null +++ b/percona/5.0.87-b20-20091116/mirror_binlog.patch @@ -0,0 +1,2694 @@ +diff -r 66cc9e0a6768 mysql-test/lib/mtr_cases.pl +--- a/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:37:12 2008 -0800 ++++ b/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:46:15 2008 -0800 +@@ -334,6 +334,10 @@ + + $tinfo->{'slave_num'}= 1; # Default for rpl* tests, use one slave + ++ if ( $tname eq 'rpl_mirror_binlog' ) ++ { ++ $tinfo->{'slave_num'}= 3; ++ } + } + + if ( defined mtr_match_prefix($tname,"federated") ) +@@ -344,15 +348,20 @@ + + my $master_opt_file= "$testdir/$tname-master.opt"; + my $slave_opt_file= "$testdir/$tname-slave.opt"; +- my $slave_mi_file= "$testdir/$tname.slave-mi"; ++ my $slave_mi_files= ["$testdir/$tname.slave-mi", ++ "$testdir/$tname.1.slave-mi", ++ "$testdir/$tname.2.slave-mi"]; + my $master_sh= "$testdir/$tname-master.sh"; + my $slave_sh= "$testdir/$tname-slave.sh"; + my $disabled_file= 
"$testdir/$tname.disabled"; + my $im_opt_file= "$testdir/$tname-im.opt"; + +- $tinfo->{'master_opt'}= []; +- $tinfo->{'slave_opt'}= []; +- $tinfo->{'slave_mi'}= []; ++ $tinfo->{'master_opt'}= []; ++ $tinfo->{'slave_opt'}= []; ++ $tinfo->{'slave_mi'}= {}; ++ $tinfo->{'slave_mi'}{0}= []; ++ $tinfo->{'slave_mi'}{1}= []; ++ $tinfo->{'slave_mi'}{2}= []; + + if ( -f $master_opt_file ) + { +@@ -427,9 +436,14 @@ + push(@{$tinfo->{'slave_opt'}}, @$slave_opt); + } + +- if ( -f $slave_mi_file ) ++ my $mi_idx= 0; ++ foreach my $slave_mi_file ( @$slave_mi_files ) + { +- $tinfo->{'slave_mi'}= mtr_get_opts_from_file($slave_mi_file); ++ if ( -f $slave_mi_file ) ++ { ++ $tinfo->{'slave_mi'}{$mi_idx}= mtr_get_opts_from_file($slave_mi_file); ++ } ++ $mi_idx+= 1; + } + + if ( -f $master_sh ) +diff -r 66cc9e0a6768 mysql-test/mysql-test-run.pl +--- a/mysql-test/mysql-test-run.pl Thu Dec 04 21:37:12 2008 -0800 ++++ b/mysql-test/mysql-test-run.pl Thu Dec 04 21:46:15 2008 -0800 +@@ -275,6 +275,7 @@ + our $opt_stress_test_file= ""; + + our $opt_warnings; ++our $opt_slave_innodb= 0; + + our $opt_skip_ndbcluster= 0; + our $opt_skip_ndbcluster_slave= 0; +@@ -299,6 +300,8 @@ + our $used_binlog_format; + our $used_default_engine; + our $debug_compiled_binaries; ++ ++our $current_testname= ""; + + our %mysqld_variables; + +@@ -645,6 +648,7 @@ + 'testcase-timeout=i' => \$opt_testcase_timeout, + 'suite-timeout=i' => \$opt_suite_timeout, + 'warnings|log-warnings' => \$opt_warnings, ++ 'slave-innodb' => \$opt_slave_innodb, + + # Options which are no longer used + (map { $_ => \&warn_about_removed_option } @removed_options), +@@ -1001,6 +1005,14 @@ + { + $ENV{'BIG_TEST'}= 1; + } ++ ++ # -------------------------------------------------------------------------- ++ # Big test flags ++ # -------------------------------------------------------------------------- ++ if ( $opt_big_test ) ++ { ++ $ENV{'BIG_TEST'}= 1; ++ } + + # -------------------------------------------------------------------------- + # 
Gcov flag +@@ -1885,7 +1897,9 @@ + $ENV{'SLAVE_MYSOCK'}= $slave->[0]->{'path_sock'}; + $ENV{'SLAVE_MYPORT'}= $slave->[0]->{'port'}; + $ENV{'SLAVE_MYPORT1'}= $slave->[1]->{'port'}; ++ $ENV{'SLAVE_MYSOCK1'}= $slave->[1]->{'path_sock'}; + $ENV{'SLAVE_MYPORT2'}= $slave->[2]->{'port'}; ++ $ENV{'SLAVE_MYSOCK2'}= $slave->[2]->{'path_sock'}; + $ENV{'MYSQL_TCP_PORT'}= $mysqld_variables{'port'}; + $ENV{'DEFAULT_MASTER_PORT'}= $mysqld_variables{'master-port'}; + +@@ -2375,6 +2389,8 @@ + if ( ! $glob_win32 ) + { + symlink("$glob_mysql_test_dir/std_data", "$opt_vardir/std_data_ln"); ++ my @a = ("chmod", "-R", "o+r", "$glob_mysql_test_dir/std_data"); ++ system(@a) == 0 or die "system @ failed: $?" + } + else + { +@@ -3466,6 +3482,8 @@ + $ENV{'TZ'}= $tinfo->{'timezone'}; + mtr_verbose("Setting timezone: $tinfo->{'timezone'}"); + ++ $current_testname= $tinfo->{'name'}; ++ + my $master_restart= run_testcase_need_master_restart($tinfo); + my $slave_restart= run_testcase_need_slave_restart($tinfo); + +@@ -3881,7 +3899,8 @@ + unless $mysqld->{'type'} eq 'slave'; + + mtr_add_arg($args, "%s--init-rpl-role=slave", $prefix); +- if (! ( $opt_skip_slave_binlog || $skip_binlog )) ++ ++ if (! ($opt_skip_slave_binlog or ($current_testname eq 'rpl_mirror_binlog')) ) + { + mtr_add_arg($args, "%s--log-bin=%s/log/slave%s-bin", $prefix, + $opt_vardir, $sidx); # FIXME use own dir for binlogs +@@ -4568,7 +4587,7 @@ + if ( ! $slave->[$idx]->{'pid'} ) + { + mysqld_start($slave->[$idx],$tinfo->{'slave_opt'}, +- $tinfo->{'slave_mi'}); ++ $tinfo->{'slave_mi'}{$idx}); + + } + } +@@ -4580,7 +4599,6 @@ + # Wait for clusters to start + foreach my $cluster (@{$clusters}) + { +- + next if !$cluster->{'pid'}; + + if (ndbcluster_wait_started($cluster, "")) +@@ -5179,6 +5197,7 @@ + skip-im Don't start IM, and skip the IM test cases + big-test Set the environment variable BIG_TEST, which can be + checked from test cases. 
++ + + Options that specify ports + +diff -r 66cc9e0a6768 mysql-test/r/rpl_mirror_binlog.result +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/r/rpl_mirror_binlog.result Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,441 @@ ++stop slave; ++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; ++reset master; ++reset slave; ++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; ++start slave; ++drop table if exists t1; ++create table t1(n int) engine = InnoDB; ++insert into t1 values (300); ++insert into t1 values (299); ++insert into t1 values (298); ++insert into t1 values (297); ++insert into t1 values (296); ++insert into t1 values (295); ++insert into t1 values (294); ++insert into t1 values (293); ++insert into t1 values (292); ++insert into t1 values (291); ++insert into t1 values (290); ++insert into t1 values (289); ++insert into t1 values (288); ++insert into t1 values (287); ++insert into t1 values (286); ++insert into t1 values (285); ++insert into t1 values (284); ++insert into t1 values (283); ++insert into t1 values (282); ++insert into t1 values (281); ++insert into t1 values (280); ++insert into t1 values (279); ++insert into t1 values (278); ++insert into t1 values (277); ++insert into t1 values (276); ++insert into t1 values (275); ++insert into t1 values (274); ++insert into t1 values (273); ++insert into t1 values (272); ++insert into t1 values (271); ++insert into t1 values (270); ++insert into t1 values (269); ++insert into t1 values (268); ++insert into t1 values (267); ++insert into t1 values (266); ++insert into t1 values (265); ++insert into t1 values (264); ++insert into t1 values (263); ++insert into t1 values (262); ++insert into t1 values (261); ++insert into t1 values (260); ++insert into t1 values (259); ++insert into t1 values (258); ++insert into t1 values (257); ++insert into t1 values (256); ++insert into t1 values (255); ++insert into t1 values (254); ++insert into t1 values (253); ++insert into t1 values (252); ++insert 
into t1 values (251); ++insert into t1 values (250); ++insert into t1 values (249); ++insert into t1 values (248); ++insert into t1 values (247); ++insert into t1 values (246); ++insert into t1 values (245); ++insert into t1 values (244); ++insert into t1 values (243); ++insert into t1 values (242); ++insert into t1 values (241); ++insert into t1 values (240); ++insert into t1 values (239); ++insert into t1 values (238); ++insert into t1 values (237); ++insert into t1 values (236); ++insert into t1 values (235); ++insert into t1 values (234); ++insert into t1 values (233); ++insert into t1 values (232); ++insert into t1 values (231); ++insert into t1 values (230); ++insert into t1 values (229); ++insert into t1 values (228); ++insert into t1 values (227); ++insert into t1 values (226); ++insert into t1 values (225); ++insert into t1 values (224); ++insert into t1 values (223); ++insert into t1 values (222); ++insert into t1 values (221); ++insert into t1 values (220); ++insert into t1 values (219); ++insert into t1 values (218); ++insert into t1 values (217); ++insert into t1 values (216); ++insert into t1 values (215); ++insert into t1 values (214); ++insert into t1 values (213); ++insert into t1 values (212); ++insert into t1 values (211); ++insert into t1 values (210); ++insert into t1 values (209); ++insert into t1 values (208); ++insert into t1 values (207); ++insert into t1 values (206); ++insert into t1 values (205); ++insert into t1 values (204); ++insert into t1 values (203); ++insert into t1 values (202); ++insert into t1 values (201); ++insert into t1 values (200); ++insert into t1 values (199); ++insert into t1 values (198); ++insert into t1 values (197); ++insert into t1 values (196); ++insert into t1 values (195); ++insert into t1 values (194); ++insert into t1 values (193); ++insert into t1 values (192); ++insert into t1 values (191); ++insert into t1 values (190); ++insert into t1 values (189); ++insert into t1 values (188); ++insert into t1 values 
(187); ++insert into t1 values (186); ++insert into t1 values (185); ++insert into t1 values (184); ++insert into t1 values (183); ++insert into t1 values (182); ++insert into t1 values (181); ++insert into t1 values (180); ++insert into t1 values (179); ++insert into t1 values (178); ++insert into t1 values (177); ++insert into t1 values (176); ++insert into t1 values (175); ++insert into t1 values (174); ++insert into t1 values (173); ++insert into t1 values (172); ++insert into t1 values (171); ++insert into t1 values (170); ++insert into t1 values (169); ++insert into t1 values (168); ++insert into t1 values (167); ++insert into t1 values (166); ++insert into t1 values (165); ++insert into t1 values (164); ++insert into t1 values (163); ++insert into t1 values (162); ++insert into t1 values (161); ++insert into t1 values (160); ++insert into t1 values (159); ++insert into t1 values (158); ++insert into t1 values (157); ++insert into t1 values (156); ++insert into t1 values (155); ++insert into t1 values (154); ++insert into t1 values (153); ++insert into t1 values (152); ++insert into t1 values (151); ++insert into t1 values (150); ++insert into t1 values (149); ++insert into t1 values (148); ++insert into t1 values (147); ++insert into t1 values (146); ++insert into t1 values (145); ++insert into t1 values (144); ++insert into t1 values (143); ++insert into t1 values (142); ++insert into t1 values (141); ++insert into t1 values (140); ++insert into t1 values (139); ++insert into t1 values (138); ++insert into t1 values (137); ++insert into t1 values (136); ++insert into t1 values (135); ++insert into t1 values (134); ++insert into t1 values (133); ++insert into t1 values (132); ++insert into t1 values (131); ++insert into t1 values (130); ++insert into t1 values (129); ++insert into t1 values (128); ++insert into t1 values (127); ++insert into t1 values (126); ++insert into t1 values (125); ++insert into t1 values (124); ++insert into t1 values (123); ++insert 
into t1 values (122); ++insert into t1 values (121); ++insert into t1 values (120); ++insert into t1 values (119); ++insert into t1 values (118); ++insert into t1 values (117); ++insert into t1 values (116); ++insert into t1 values (115); ++insert into t1 values (114); ++insert into t1 values (113); ++insert into t1 values (112); ++insert into t1 values (111); ++insert into t1 values (110); ++insert into t1 values (109); ++insert into t1 values (108); ++insert into t1 values (107); ++insert into t1 values (106); ++insert into t1 values (105); ++insert into t1 values (104); ++insert into t1 values (103); ++insert into t1 values (102); ++insert into t1 values (101); ++insert into t1 values (100); ++insert into t1 values (99); ++insert into t1 values (98); ++insert into t1 values (97); ++insert into t1 values (96); ++insert into t1 values (95); ++insert into t1 values (94); ++insert into t1 values (93); ++insert into t1 values (92); ++insert into t1 values (91); ++insert into t1 values (90); ++insert into t1 values (89); ++insert into t1 values (88); ++insert into t1 values (87); ++insert into t1 values (86); ++insert into t1 values (85); ++insert into t1 values (84); ++insert into t1 values (83); ++insert into t1 values (82); ++insert into t1 values (81); ++insert into t1 values (80); ++insert into t1 values (79); ++insert into t1 values (78); ++insert into t1 values (77); ++insert into t1 values (76); ++insert into t1 values (75); ++insert into t1 values (74); ++insert into t1 values (73); ++insert into t1 values (72); ++insert into t1 values (71); ++insert into t1 values (70); ++insert into t1 values (69); ++insert into t1 values (68); ++insert into t1 values (67); ++insert into t1 values (66); ++insert into t1 values (65); ++insert into t1 values (64); ++insert into t1 values (63); ++insert into t1 values (62); ++insert into t1 values (61); ++insert into t1 values (60); ++insert into t1 values (59); ++insert into t1 values (58); ++insert into t1 values (57); 
++insert into t1 values (56); ++insert into t1 values (55); ++insert into t1 values (54); ++insert into t1 values (53); ++insert into t1 values (52); ++insert into t1 values (51); ++insert into t1 values (50); ++insert into t1 values (49); ++insert into t1 values (48); ++insert into t1 values (47); ++insert into t1 values (46); ++insert into t1 values (45); ++insert into t1 values (44); ++insert into t1 values (43); ++insert into t1 values (42); ++insert into t1 values (41); ++insert into t1 values (40); ++insert into t1 values (39); ++insert into t1 values (38); ++insert into t1 values (37); ++insert into t1 values (36); ++insert into t1 values (35); ++insert into t1 values (34); ++insert into t1 values (33); ++insert into t1 values (32); ++insert into t1 values (31); ++insert into t1 values (30); ++insert into t1 values (29); ++insert into t1 values (28); ++insert into t1 values (27); ++insert into t1 values (26); ++insert into t1 values (25); ++insert into t1 values (24); ++insert into t1 values (23); ++insert into t1 values (22); ++insert into t1 values (21); ++insert into t1 values (20); ++insert into t1 values (19); ++insert into t1 values (18); ++insert into t1 values (17); ++insert into t1 values (16); ++insert into t1 values (15); ++insert into t1 values (14); ++insert into t1 values (13); ++insert into t1 values (12); ++insert into t1 values (11); ++insert into t1 values (10); ++insert into t1 values (9); ++insert into t1 values (8); ++insert into t1 values (7); ++insert into t1 values (6); ++insert into t1 values (5); ++insert into t1 values (4); ++insert into t1 values (3); ++insert into t1 values (2); ++insert into t1 values (1); ++"The following are SLAVE." 
++select count(distinct n) from t1; ++count(distinct n) ++300 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++300 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9306 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No # ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000014 2849 ++"The following are SLAVE1." ++start slave; ++select count(distinct n) from t1; ++count(distinct n) ++300 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++300 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No # ++"The following are SLAVE." 
++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication-log'; ++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log ++stop slave; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication_log'; ++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log ++MAKE MASTER REVOKE SESSION WITH KILL; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++MASTER_SERVER_ID=2, ++INDEX='replication_log' ++ WITH BINLOG; ++MAKE MASTER GRANT SESSION; ++delete from t1 where n > 250; ++select count(distinct n) from t1; ++count(distinct n) ++250 ++"The following are SLAVE1." ++select count(distinct n) from t1; ++count(distinct n) ++250 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++250 ++"The following are SLAVE2." ++start slave; ++select count(distinct n) from t1; ++count(distinct n) ++250 ++select min(n) from t1; ++min(n) ++1 ++select max(n) from t1; ++max(n) ++250 ++show slave status; ++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master ++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000015 189 # # master-bin.000015 Yes Yes # 0 0 189 # None 0 No # ++drop table t1; ++drop table t1; ++"The following are SLAVE." 
++show master logs; ++Log_name File_size ++master-bin.000001 4214 ++master-bin.000002 4212 ++master-bin.000003 4212 ++master-bin.000004 4212 ++master-bin.000005 4212 ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000015 265 ++"The following are SLAVE2." ++show master logs; ++Log_name File_size ++master-bin.000001 4214 ++master-bin.000002 4212 ++master-bin.000003 4212 ++master-bin.000004 4212 ++master-bin.000005 4212 ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++show master status; ++File Position Binlog_Do_DB Binlog_Ignore_DB ++master-bin.000015 265 ++purge master logs to 'master-bin.000006'; ++show master logs; ++Log_name File_size ++master-bin.000006 4212 ++master-bin.000007 4212 ++master-bin.000008 4212 ++master-bin.000009 4212 ++master-bin.000010 4194 ++master-bin.000011 4190 ++master-bin.000012 4190 ++master-bin.000013 4190 ++master-bin.000014 2849 ++master-bin.000015 265 ++reset master; ++ERROR HY000: Binlog closed, cannot RESET MASTER +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-master.opt +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog-master.opt Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++-O max_binlog_size=4096 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-slave.opt +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog-slave.opt Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--rpl_mirror_binlog_enabled=1 --log-bin-index=replication_log +diff -r 66cc9e0a6768 
mysql-test/t/rpl_mirror_binlog.1.slave-mi +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.1.slave-mi Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=3 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.2.slave-mi +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.2.slave-mi Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,1 @@ ++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=4 +diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.test +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/mysql-test/t/rpl_mirror_binlog.test Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,119 @@ ++-- source include/master-slave.inc ++-- source include/have_innodb.inc ++connect (slave_sec,localhost,root,,test,$SLAVE_MYPORT1,$SLAVE_MYSOCK1); ++connect (slave_ter,localhost,root,,test,$SLAVE_MYPORT2,$SLAVE_MYSOCK2); ++ ++connection master; ++--disable_warnings ++drop table if exists t1; ++--enable_warnings ++create table t1(n int) engine = InnoDB; ++ ++let $i=300; ++while ($i) ++{ ++ eval insert into t1 values ($i); ++ dec $i; ++} ++ ++save_master_pos; ++ ++connection slave; ++sync_with_master; ++ ++echo "The following are SLAVE."; ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++show master status; ++ ++connection slave_sec; ++echo "The following are SLAVE1."; ++start slave; ++sync_with_master; ++ ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++ ++# make the slave the new master ++connection slave; ++echo "The following are SLAVE."; ++ ++# The first 1201 error is caused by running slave. 
++--error 1201 ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication-log'; ++stop slave; ++ ++# The second 1201 error is caused by failover mode. ++--error 1201 ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication_log'; ++ ++MAKE MASTER REVOKE SESSION WITH KILL; ++MAKE MASTER MASTER_LOG_FILE='master-bin', ++ MASTER_SERVER_ID=2, ++ INDEX='replication_log' ++ WITH BINLOG; ++ ++MAKE MASTER GRANT SESSION; ++ ++delete from t1 where n > 250; ++save_master_pos; ++ ++select count(distinct n) from t1; ++ ++connection slave_sec; ++echo "The following are SLAVE1."; ++ ++sync_with_master; ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++ ++connection slave_ter; ++echo "The following are SLAVE2."; ++start slave; ++sync_with_master; ++ ++select count(distinct n) from t1; ++select min(n) from t1; ++select max(n) from t1; ++ ++--replace_column 8 # 9 # 18 # 23 # 33 # ++show slave status; ++ ++connection master; ++drop table t1; ++ ++connection slave; ++drop table t1; ++save_master_pos; ++ ++connection slave_sec; ++sync_with_master; ++ ++connection slave; ++echo "The following are SLAVE."; ++ ++show master logs; ++show master status; ++ ++ ++connection slave_ter; ++echo "The following are SLAVE2."; ++sync_with_master; ++ ++show master logs; ++show master status; ++purge master logs to 'master-bin.000006'; ++show master logs; ++--error 1186 ++reset master; +diff -r 66cc9e0a6768 patch_info/mirror_binlog.info +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/patch_info/mirror_binlog.info Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,6 @@ ++File=mirror_binlog.patch ++Name=Mirroring binary logs on slave ++Version=V1 ++Author=Google ++License=GPL ++Comment=contains FastMaster promotion patch +diff -r 66cc9e0a6768 sql/Makefile.am +--- a/sql/Makefile.am Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/Makefile.am Thu Dec 04 21:46:15 2008 -0800 +@@ -68,7 +68,7 @@ + sql_array.h sql_cursor.h 
\ + examples/ha_example.h ha_archive.h \ + examples/ha_tina.h ha_blackhole.h \ +- ha_federated.h ++ ha_federated.h repl_mule.h + mysqld_SOURCES = sql_lex.cc sql_handler.cc \ + item.cc item_sum.cc item_buff.cc item_func.cc \ + item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \ +@@ -105,7 +105,7 @@ + sp_cache.cc parse_file.cc sql_trigger.cc \ + examples/ha_example.cc ha_archive.cc \ + examples/ha_tina.cc ha_blackhole.cc \ +- ha_federated.cc ++ ha_federated.cc repl_mule.cc + + gen_lex_hash_SOURCES = gen_lex_hash.cc + gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) +diff -r 66cc9e0a6768 sql/Makefile.in +--- a/sql/Makefile.in Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/Makefile.in Thu Dec 04 21:46:15 2008 -0800 +@@ -152,7 +152,7 @@ + sp_rcontext.$(OBJEXT) sp.$(OBJEXT) sp_cache.$(OBJEXT) \ + parse_file.$(OBJEXT) sql_trigger.$(OBJEXT) \ + ha_example.$(OBJEXT) ha_archive.$(OBJEXT) ha_tina.$(OBJEXT) \ +- ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) ++ ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) repl_mule.$(OBJEXT) + mysqld_OBJECTS = $(am_mysqld_OBJECTS) + mysqld_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) \ + $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) \ +@@ -516,7 +516,7 @@ + sql_array.h sql_cursor.h \ + examples/ha_example.h ha_archive.h \ + examples/ha_tina.h ha_blackhole.h \ +- ha_federated.h ++ ha_federated.h repl_mule.h + + mysqld_SOURCES = sql_lex.cc sql_handler.cc \ + item.cc item_sum.cc item_buff.cc item_func.cc \ +@@ -554,7 +554,7 @@ + sp_cache.cc parse_file.cc sql_trigger.cc \ + examples/ha_example.cc ha_archive.cc \ + examples/ha_tina.cc ha_blackhole.cc \ +- ha_federated.cc ++ ha_federated.cc repl_mule.cc + + gen_lex_hash_SOURCES = gen_lex_hash.cc + gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS) +@@ -748,6 +748,7 @@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protocol.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/records.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_failsafe.Po@am__quote@ 
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_mule.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/set_var.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/slave.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sp.Po@am__quote@ +diff -r 66cc9e0a6768 sql/lex.h +--- a/sql/lex.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/lex.h Thu Dec 04 21:46:15 2008 -0800 +@@ -292,6 +292,7 @@ + { "LONGTEXT", SYM(LONGTEXT)}, + { "LOOP", SYM(LOOP_SYM)}, + { "LOW_PRIORITY", SYM(LOW_PRIORITY)}, ++ { "MAKE", SYM(MAKE_SYM)}, + { "MASTER", SYM(MASTER_SYM)}, + { "MASTER_CONNECT_RETRY", SYM(MASTER_CONNECT_RETRY_SYM)}, + { "MASTER_HOST", SYM(MASTER_HOST_SYM)}, +diff -r 66cc9e0a6768 sql/log.cc +--- a/sql/log.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/log.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -79,7 +79,9 @@ + + bool binlog_init() + { +- return !opt_bin_log; ++ if (!opt_bin_log) ++ binlog_hton.prepare = NULL; ++ return 0; /* return !opt_bin_log; */ + } + + static int binlog_close_connection(THD *thd) +@@ -406,6 +408,7 @@ + :bytes_written(0), last_time(0), query_start(0), name(0), + prepared_xids(0), log_type(LOG_CLOSED), file_id(1), open_count(1), + write_error(FALSE), inited(FALSE), need_start_event(TRUE), ++ mule_binlog_(0), + description_event_for_exec(0), description_event_for_queue(0) + { + /* +@@ -506,7 +509,10 @@ + const char *log_name) + { + File index_file_nr= -1; +- DBUG_ASSERT(!my_b_inited(&index_file)); ++ ++ /* If the index is already opened, do not open it again. 
*/ ++ if (my_b_inited(&index_file)) ++ return FALSE; + + /* + First open of this class instance +@@ -750,7 +756,7 @@ + if (file >= 0) + my_close(file,MYF(0)); + end_io_cache(&log_file); +- end_io_cache(&index_file); ++ close_index_file(); + safeFree(name); + log_type= LOG_CLOSED; + DBUG_RETURN(1); +@@ -768,7 +774,10 @@ + int MYSQL_LOG::raw_get_current_log(LOG_INFO* linfo) + { + strmake(linfo->log_file_name, log_file_name, sizeof(linfo->log_file_name)-1); +- linfo->pos = my_b_tell(&log_file); ++ if (!mule_binlog_) ++ linfo->pos = my_b_tell(&log_file); ++ else ++ linfo->pos = my_b_filelength(&log_file); + return 0; + } + +@@ -935,6 +944,11 @@ + if (need_lock) + pthread_mutex_lock(&LOCK_index); + safe_mutex_assert_owner(&LOCK_index); ++ ++ if (open_index_file(index_file_name, NULL) != 0) { ++ error = -1; ++ goto err; ++ } + + /* As the file is flushed, we can't get an error here */ + (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0, +@@ -1446,18 +1460,19 @@ + SYNOPSIS + new_file() + need_lock Set to 1 if caller has not locked LOCK_log ++ logfile_name the specified log filename. + + NOTE + The new file name is stored last in the index file + */ + +-void MYSQL_LOG::new_file(bool need_lock) ++void MYSQL_LOG::new_file(bool need_lock, const char* log_filename) + { + char new_name[FN_REFLEN], *new_name_ptr, *old_name; + enum_log_type save_log_type; + + DBUG_ENTER("MYSQL_LOG::new_file"); +- if (!is_open()) ++ if (!is_log_open()) + { + DBUG_PRINT("info",("log is closed")); + DBUG_VOID_RETURN; +@@ -1496,7 +1511,9 @@ + We have to do this here and not in open as we want to store the + new file name in the current binary log file. 
+ */ +- if (generate_new_name(new_name, name)) ++ if (log_filename) { ++ fn_format(new_name,log_filename,mysql_data_home,"",4); ++ } else if (generate_new_name(new_name, name)) + goto end; + new_name_ptr=new_name; + +@@ -1571,7 +1588,7 @@ + bytes_written+= ev->data_written; + DBUG_PRINT("info",("max_size: %lu",max_size)); + if ((uint) my_b_append_tell(&log_file) > max_size) +- new_file(0); ++ new_file(0); + + err: + pthread_mutex_unlock(&LOCK_log); +@@ -1600,8 +1617,14 @@ + bytes_written += len; + } while ((buf=va_arg(args,const char*)) && (len=va_arg(args,uint))); + DBUG_PRINT("info",("max_size: %lu",max_size)); +- if ((uint) my_b_append_tell(&log_file) > max_size) +- new_file(0); ++ ++ /* If max_size is BINLOG_NOSWITCH_SIZE, binlog would not switch because ++ * of file size limit. ++ */ ++ if (max_size != BINLOG_NOSWITCH_SIZE && ++ (uint) my_b_append_tell(&log_file) > max_size) { ++ new_file(0); ++ } + + err: + if (!error) +@@ -2492,6 +2515,17 @@ + DBUG_VOID_RETURN; + } + ++int MYSQL_LOG::flush_log_file() { ++ return flush_io_cache(&log_file); ++} ++ ++int MYSQL_LOG::close_index_file() { ++ if (my_b_inited(&index_file)) { ++ end_io_cache(&index_file); ++ my_close(index_file.file, MYF(0)); ++ } ++ return 0; ++} + + /* + Check if a string is a valid number +diff -r 66cc9e0a6768 sql/log_event.h +--- a/sql/log_event.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/log_event.h Thu Dec 04 21:46:15 2008 -0800 +@@ -94,6 +94,14 @@ + #define LINE_TERM_EMPTY 0x4 + #define LINE_START_EMPTY 0x8 + #define ESCAPED_EMPTY 0x10 ++ ++/* This server-id value is used to indicate a special master-info event ++ * in relay-log. ++ * We will enforce in database that replication can not set this value ++ * as the server-id. 
++ */ ++#define MASTER_INFO_SERVER_ID 0xffffffff ++ + + /***************************************************************************** + +diff -r 66cc9e0a6768 sql/mysql_priv.h +--- a/sql/mysql_priv.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/mysql_priv.h Thu Dec 04 21:46:15 2008 -0800 +@@ -462,6 +462,7 @@ + /* BINLOG_DUMP options */ + + #define BINLOG_DUMP_NON_BLOCK 1 ++#define BINLOG_MIRROR_CLIENT 0x0004 + + /* sql_show.cc:show_log_files() */ + #define SHOW_LOG_STATUS_FREE "FREE" +@@ -1374,6 +1375,7 @@ + extern const char **errmesg; /* Error messages */ + extern const char *myisam_recover_options_str; + extern const char *in_left_expr_name, *in_additional_cond, *in_having_cond; ++extern char *opt_binlog_index_name; + extern const char * const triggers_file_ext; + extern const char * const trigname_file_ext; + extern Eq_creator eq_creator; +@@ -1875,6 +1877,10 @@ + extern "C" void unireg_abort(int exit_code); + void kill_delayed_threads(void); + bool check_stack_overrun(THD *thd, long margin, char *dummy); ++extern my_bool rpl_mirror_binlog_enabled; ++extern ulong sync_mirror_binlog_period; ++extern my_bool rpl_mirror_binlog_no_replicate; ++extern ulong rpl_mirror_binlog_clients, rpl_mirror_binlog_status; + #else + #define unireg_abort(exit_code) DBUG_RETURN(exit_code) + inline void kill_delayed_threads(void) {} +diff -r 66cc9e0a6768 sql/mysqld.cc +--- a/sql/mysqld.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/mysqld.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -555,6 +555,7 @@ + pthread_mutex_t LOCK_global_user_client_stats; + pthread_mutex_t LOCK_global_table_stats; + pthread_mutex_t LOCK_global_index_stats; ++pthread_mutex_t LOCK_failover_master; + /* + The below lock protects access to two global server variables: + max_prepared_stmt_count and prepared_stmt_count. 
These variables +@@ -584,13 +585,15 @@ + char *master_ssl_key, *master_ssl_cert; + char *master_ssl_ca, *master_ssl_capath, *master_ssl_cipher; + ++char *opt_binlog_index_name; ++ + /* Static variables */ + + static bool kill_in_progress, segfaulted; + static my_bool opt_do_pstack, opt_bootstrap, opt_myisam_log; + static int cleanup_done; + static ulong opt_specialflag, opt_myisam_block_size; +-static char *opt_logname, *opt_update_logname, *opt_binlog_index_name; ++static char *opt_logname, *opt_update_logname; + static char *opt_tc_heuristic_recover; + static char *mysql_home_ptr, *pidfile_name_ptr; + static char **defaults_argv; +@@ -598,6 +601,32 @@ + + static my_socket unix_sock,ip_sock; + struct rand_struct sql_rand; // used by sql_class.cc:THD::THD() ++ ++/* When set, we are inside a failover slave and deny all non-super access */ ++bool failover_deny_access= 0; ++ ++/* When set, binlog will be mirrored on the replica. */ ++my_bool rpl_mirror_binlog_enabled; ++ ++/* Sync the mirrored binlog to disk after every #th event. */ ++ulong sync_mirror_binlog_period; ++ ++/* The fixed size for replication event buffer. Replication event can exceed ++ * the size. ++ */ ++//ulong rpl_event_buffer_size; ++ ++/* This is a mirror binlog status variable on the primary to indicate how many ++ * mirror binlog servers are connecting. ++ */ ++ulong rpl_mirror_binlog_clients = 0; ++ ++/* This indicates whether mirror binlog is working on a replica database. It ++ * requires: ++ * . rpl_mirror_binlog_enabled = 1 ++ * . 
the slave I/O thread is running and mirror binlog is also dumped ++ */ ++ulong rpl_mirror_binlog_status = 0; + + /* OS specific variables */ + +@@ -1315,6 +1344,7 @@ + (void) pthread_cond_destroy(&COND_flush_thread_cache); + (void) pthread_cond_destroy(&COND_manager); + (void) pthread_mutex_destroy(&LOCK_stats); ++ (void) pthread_mutex_destroy(&LOCK_failover_master); + (void) pthread_mutex_destroy(&LOCK_global_user_client_stats); + (void) pthread_mutex_destroy(&LOCK_global_table_stats); + (void) pthread_mutex_destroy(&LOCK_global_index_stats); +@@ -3164,6 +3194,7 @@ + (void) pthread_cond_init(&COND_rpl_status, NULL); + #endif + (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST); ++ (void) pthread_mutex_init(&LOCK_failover_master, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST); + (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST); +@@ -3398,39 +3429,8 @@ + + if (opt_bin_log) + { +- char buf[FN_REFLEN]; +- const char *ln; +- ln= mysql_bin_log.generate_name(opt_bin_logname, "-bin", 1, buf); +- if (!opt_bin_logname && !opt_binlog_index_name) +- { +- /* +- User didn't give us info to name the binlog index file. +- Picking `hostname`-bin.index like did in 4.x, causes replication to +- fail if the hostname is changed later. So, we would like to instead +- require a name. But as we don't want to break many existing setups, we +- only give warning, not error. +- */ +- sql_print_warning("No argument was provided to --log-bin, and " +- "--log-bin-index was not used; so replication " +- "may break when this MySQL server acts as a " +- "master and has his hostname changed!! 
Please " +- "use '--log-bin=%s' to avoid this problem.", ln); +- } +- if (ln == buf) +- { +- my_free(opt_bin_logname, MYF(MY_ALLOW_ZERO_PTR)); +- opt_bin_logname=my_strdup(buf, MYF(0)); +- } +- if (mysql_bin_log.open_index_file(opt_binlog_index_name, ln)) +- { +- unireg_abort(1); +- } +- +- /* +- Used to specify which type of lock we need to use for queries of type +- INSERT ... SELECT. This will change when we have row level logging. +- */ +- using_update_log=1; ++ if (make_master_open_index(&opt_bin_logname, opt_binlog_index_name) != 0) ++ unireg_abort(1); + } + + if (xid_cache_init()) +@@ -3480,9 +3480,10 @@ + unireg_abort(1); + } + +- if (opt_bin_log && mysql_bin_log.open(opt_bin_logname, LOG_BIN, 0, +- WRITE_CACHE, 0, max_binlog_size, 0)) +- unireg_abort(1); ++ if (opt_bin_log && ++ make_master(NULL, opt_bin_logname, opt_binlog_index_name, NULL) != 0) { ++ unireg_abort(1); ++ } + + #ifdef HAVE_REPLICATION + if (opt_bin_log && expire_logs_days) +@@ -5098,6 +5098,8 @@ + OPT_INNODB_READ_IO_THREADS, + OPT_INNODB_WRITE_IO_THREADS, + OPT_INNODB_ADAPTIVE_HASH_INDEX, ++ OPT_RPL_MIRROR_BINLOG, ++ OPT_SYNC_MIRROR_BINLOG, + OPT_FEDERATED, + OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM + }; +@@ -5725,6 +5728,11 @@ + {"rpl-recovery-rank", OPT_RPL_RECOVERY_RANK, "Undocumented.", + (gptr*) &rpl_recovery_rank, (gptr*) &rpl_recovery_rank, 0, GET_ULONG, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, ++ {"rpl_mirror_binlog_enabled", OPT_RPL_MIRROR_BINLOG, ++ "1 = support mirroring binlogs. 
0 = disable mirroring binlogs", ++ (gptr*) &rpl_mirror_binlog_enabled, ++ (gptr*) &rpl_mirror_binlog_enabled, 0, GET_BOOL, NO_ARG, ++ 0, 0, 1, 0, 1, 0}, + {"safe-mode", OPT_SAFE, "Skip some optimize stages (for testing).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + #ifndef TO_BE_DELETED +@@ -5849,6 +5857,11 @@ + {"symbolic-links", 's', "Enable symbolic link support.", + (gptr*) &my_use_symdir, (gptr*) &my_use_symdir, 0, GET_BOOL, NO_ARG, + IF_PURIFY(0,1), 0, 0, 0, 0, 0}, ++ {"sync-mirror-binlog", OPT_SYNC_MIRROR_BINLOG, ++ "Sync the mirrored binlog to disk after every #th event. " ++ "#=0 (the default) does no sync. Syncing slows MySQL down", ++ (gptr*) &sync_mirror_binlog_period, ++ (gptr*) &sync_mirror_binlog_period, 0, GET_ULONG, REQUIRED_ARG, 0, 0, ~0L, 0, 1, 0}, + {"sysdate-is-now", OPT_SYSDATE_IS_NOW, + "Non-default option to alias SYSDATE() to NOW() to make it safe-replicable. Since 5.0, SYSDATE() returns a `dynamic' value different for different invocations, even within the same statement.", + (gptr*) &global_system_variables.sysdate_is_now, +@@ -6625,6 +6638,7 @@ + {"Delayed_errors", (char*) &delayed_insert_errors, SHOW_LONG}, + {"Delayed_insert_threads", (char*) &delayed_insert_threads, SHOW_LONG_CONST}, + {"Delayed_writes", (char*) &delayed_insert_writes, SHOW_LONG}, ++ {"Failover_deny_access", (char*) &failover_deny_access, SHOW_LONG}, + {"Flush_commands", (char*) &refresh_version, SHOW_LONG_CONST}, + {"Handler_commit", (char*) offsetof(STATUS_VAR, ha_commit_count), SHOW_LONG_STATUS}, + {"Handler_delete", (char*) offsetof(STATUS_VAR, ha_delete_count), SHOW_LONG_STATUS}, +diff -r 66cc9e0a6768 sql/repl_mule.cc +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/sql/repl_mule.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,466 @@ ++/* ++ Copyright (C) 2007 Google Inc. 
++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public License ++as published by the Free Software Foundation; either version 2 ++of the License, or (at your option) any later version. ++ ++This program is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with this program; if not, write to the Free Software ++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++*/ ++ ++#include "mysql_priv.h" ++#include <my_dir.h> ++#include "slave.h" ++#include "repl_mule.h" ++ ++/* max log size: 2GB */ ++#define MAX_LOG_SIZE BINLOG_NOSWITCH_SIZE ++ ++ReplMule::ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status, ++ my_off_t file_size, const char *binlog_indexname, ++ MYSQL_LOG *binlog, ulong sync_period) ++ : desc_event_(new Format_description_log_event(BINLOG_VERSION)), ++ io_thd_(thd), mi_(mi), status_(status), dump_position_(0L), ++ file_size_(file_size), mule_log_(binlog), ++ mule_log_sync_period_(sync_period), mule_log_event_counter_(0) { ++ char llbuf1[22], llbuf2[22]; ++ ++ DBUG_ENTER("ReplMule::ReplMule"); ++ ++ /* Indicate that we are in replication mule mode. */ ++ mule_log_->set_mule_mode(); ++ ++ strmake(curr_log_filename_, mi->master_log_name, ++ sizeof(curr_log_filename_)-1); ++ strmake(mule_indexname_, binlog_indexname, sizeof(mule_indexname_)-1); ++ ++ /* Open the mule log file */ ++ if (!mule_log_->is_log_open()) { ++ /* Do not open binlog file when master_log_name is not specified. We ++ * are at the I/O thread initialization time and we do not know what ++ * filename we are going to dump. ++ * We wait for the next rotation event to indicate the filename. 
++ */ ++ if (strlen(curr_log_filename_) > 0 && ++ mule_log_->open(curr_log_filename_, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) { ++ sql_print_error("ReplMule: open binlog failed: %s", ++ curr_log_filename_); ++ status_ = MULE_ERROR; ++ DBUG_VOID_RETURN; ++ } ++ } ++ ++ switch (status_) { ++ case MULE_BEHIND: ++ dump_position_ = mi->master_log_pos; ++ mi->master_log_pos = file_size_; ++ sql_print_information("ReplicationMule: MULE_BEHIND - new(%s), old(%s)", ++ llstr(mi->master_log_pos, llbuf1), ++ llstr(dump_position_, llbuf2)); ++ break; ++ case RELAY_MATCH_MULE: ++ case RELAY_MATCH_MULE_RUN: ++ dump_position_ = mi->master_log_pos; ++ sql_print_information("ReplicationMule: RELAY_MATCH_MULE."); ++ break; ++ case MULE_VERIFY: ++ case MULE_VERIFY_RELAY_BEHIND: ++ dump_position_ = mi->master_log_pos; ++ mi->master_log_pos = BIN_LOG_HEADER_SIZE; ++ sql_print_information( ++ "ReplicationMule: MULE_VERIFY - old(%s), file_size(%s)", ++ llstr(dump_position_, llbuf1), llstr(file_size_, llbuf2)); ++ ++ /* seek to the beginning of the file for verification */ ++ seekToPosition(BIN_LOG_HEADER_SIZE); ++ break; ++ } ++ ++ DBUG_VOID_RETURN; ++} ++ ++ReplMule::~ReplMule() { ++ DBUG_ENTER("ReplMule::~ReplMule"); ++ ++ if (mule_log_->is_log_open()) ++ mule_log_->close(LOG_CLOSE_INDEX); ++ mule_log_->clear_mule_mode(); ++ ++ /* If we are still in MULE_BEHIND or MULE_VERIFY state and we exit from ++ * I/O thread, it means we encountered some errors. ++ * mi->master_log_pos might be used by later slave start. It is being ++ * changed here to do event dumping or event verification. So, we should ++ * restore it to its original value. 
++ */ ++ switch (status_) { ++ case MULE_BEHIND: ++ case MULE_VERIFY: ++ if (mi_->master_log_pos < dump_position_) ++ mi_->master_log_pos = dump_position_; ++ break; ++ } ++ ++ delete desc_event_; ++ ++ DBUG_VOID_RETURN; ++} ++ ++ReplMule::WriteStatus ReplMule::writeEvent(const char* buf, ulong event_len) { ++ WriteStatus dump_status = WRITE_RELAY; ++ char llbuf1[22], llbuf2[22], llbuf3[22]; ++ char *verify_event; ++ bool verified = false; ++ bool skip_event = false; ++ ++ DBUG_ENTER("ReplMule::dumpEvent"); ++ switch (status_) { ++ case MULE_VERIFY: ++ case MULE_VERIFY_RELAY_BEHIND: ++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT && ++ IsFakeRotation(buf, event_len)) { ++ /* Do not verify the faked rotate event */ ++ if (status_ == MULE_VERIFY) ++ dump_status = SKIP_RELAY; ++ break; ++ } ++ verify_event = new char[event_len]; ++ if (verify_event == NULL) { ++ sql_print_error( ++ "ReplMule::dumpEvent - insufficient memory in verification, " ++ "position(%s), event_len(%d).", ++ llstr(mi_->master_log_pos, llbuf1), event_len); ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ if (my_b_read(mule_log_->get_log_file(), (byte*) verify_event, ++ event_len) != 0) { ++ sql_print_error( ++ "ReplMule::dumpEvent - read log error in verification, " ++ "position(%s), event_len(%d).", ++ llstr(mi_->master_log_pos, llbuf1), event_len); ++ dump_status = WRITE_ERROR; ++ delete verify_event; ++ break; ++ } ++ verified = (memcmp(buf, verify_event, event_len) == 0); ++ delete verify_event; ++ if (!verified) { ++ sql_print_error( ++ "ReplMule::dumpEvent - event does not match at position(%s)", ++ llstr(mi_->master_log_pos, llbuf1)); ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ /* fall through */ ++ case MULE_BEHIND: ++ dump_status = SKIP_RELAY; ++ if (status_ == MULE_BEHIND && ++ queueEvent(buf, event_len, &skip_event) != 0) { ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ ++ /* Skip faked rotation event */ ++ if (!skip_event) ++ mi_->master_log_pos += event_len; ++ ++ if 
(mi_->master_log_pos == dump_position_) { ++ if (dump_position_ < file_size_) { ++ status_ = MULE_VERIFY_RELAY_BEHIND; ++ } else { ++ status_ = RELAY_MATCH_MULE; ++ } ++ sql_print_information( ++ "ReplMule::dumpEvent - new status(%d) " ++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_, ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2), ++ llstr(file_size_, llbuf3)); ++ } else if (mi_->master_log_pos == file_size_) { ++ if (dump_position_ > file_size_) { ++ status_ = MULE_BEHIND; ++ } else { ++ status_ = RELAY_MATCH_MULE; ++ } ++ sql_print_information( ++ "ReplMule::dumpEvent - new status(%d) " ++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_, ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2), ++ llstr(file_size_, llbuf3)); ++ } else if (status_ != MULE_VERIFY_RELAY_BEHIND && ++ mi_->master_log_pos > dump_position_) { ++ sql_print_error( ++ "ReplMule::dumpEvent - mule position(%s) does not match " ++ "relay-log position(%s).", ++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2)); ++ dump_status = WRITE_ERROR; ++ } ++ break; ++ case RELAY_MATCH_MULE_RUN: ++ if (buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) { ++ sql_print_information(" RELAY_MATCH_MULE event %d", buf[EVENT_TYPE_OFFSET] ); ++ /* Do not write format description record if size is the same */ ++ break; ++ } ++ case RELAY_MATCH_MULE: ++ if (queueEvent(buf, event_len, &skip_event) != 0) ++ dump_status = WRITE_ERROR; ++ break; ++ } ++ ++ DBUG_RETURN(dump_status); ++} ++ ++int ReplMule::appendEvent(const char* buf, ulong event_len) { ++ char llbuf1[22]; ++ int error; ++ ++ DBUG_ENTER("ReplMule::appendEvent"); ++ ++ error = mule_log_->appendv(buf,event_len,0); ++ if (error != 0) { ++ sql_print_error("ReplMule::appendEvent - append error at %s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ } else if (mule_log_->flush_log_file() != 0) { ++ sql_print_error("ReplMule::appendEvent - flush error at 
%s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ error = -1; ++ } else if (mule_log_sync_period_ > 0) { ++ mule_log_event_counter_++; ++ if (mule_log_event_counter_ >= mule_log_sync_period_) { ++ mule_log_event_counter_ = 0; ++ error = my_sync(mule_log_->get_log_file()->file, MYF(MY_WME)); ++ if (error != 0) ++ sql_print_error("ReplMule::appendEvent - sync error at %s(%s)", ++ mi_->master_log_name, ++ llstr(mi_->master_log_pos, llbuf1)); ++ } ++ } ++ ++ DBUG_RETURN(error); ++} ++ ++int ReplMule::queueEvent(const char* buf, ulong event_len, bool *skip_event) { ++ int error = 0; ++ ++ DBUG_ENTER("ReplMule::queueEvent"); ++ ++ *skip_event = false; ++ ++ mule_log_->lock_log(); ++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT) { ++ Rotate_log_event rev(buf, event_len, desc_event_); ++ ++ /* If this is a faked rotate event and the specified filename is ++ * the same as the current binlog filename, ignore the event. ++ */ ++ if (IsFakeRotation(rev)) { ++ *skip_event = true; ++ DBUG_PRINT("info",("skipped faked rotation event")); ++ } else { ++ /* Only append real events. */ ++ if (rev.when != 0) ++ error = appendEvent(buf, event_len); ++ ++ /* Only rotate file when append succeeds. */ ++ if (error == 0) { ++ /* Create a new file: lock both index and log. */ ++ if (strlen(curr_log_filename_) == 0) { ++ /* If curr_log_filename_ is not specified, then this is the first ++ * valid rotation event to indicate the filename. 
++ */ ++ error = mule_log_->open(rev.new_log_ident, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0); ++ } else { ++ mule_log_->new_file(0, rev.new_log_ident); ++ } ++ ++ strmake(curr_log_filename_, rev.new_log_ident, ++ strlen(rev.new_log_ident)); ++ ++ DBUG_PRINT("info",("rotate file: %s", rev.new_log_ident)); ++ } ++ } ++ } else { ++ error = appendEvent(buf, event_len); ++ } ++ mule_log_->unlock_log(); ++ ++ DBUG_RETURN(error); ++} ++ ++void ReplMule::seekToPosition(my_off_t pos) { ++ DBUG_ENTER("ReplMule::seekToPosition"); ++ DBUG_PRINT("enter",("seek_pos: %ld", (ulong) pos)); ++ ++ my_b_seek(mule_log_->get_log_file(), pos); ++ DBUG_VOID_RETURN; ++} ++ ++bool ReplMule::IsFakeRotation(const char* buf, ulong event_len) { ++ DBUG_ENTER("ReplMule::IsFakeRotation"); ++ ++ Rotate_log_event rev(buf, event_len, desc_event_); ++ DBUG_RETURN(IsFakeRotation(rev)); ++} ++ ++bool ReplMule::IsFakeRotation(const Rotate_log_event& rev) { ++ DBUG_ENTER("ReplMule::IsFakeRotation"); ++ DBUG_RETURN(rev.when == 0 && ++ rev.ident_len == strlen(curr_log_filename_) && ++ strcmp(rev.new_log_ident, curr_log_filename_) == 0); ++} ++ ++/* createReplicationMule: ++ * Create a mule that relays master's replication binlog and ++ * generate an exact same copy on the local filesystem. 
++ * ++ * Code flow: ++ * last_mulelog = scan the existing mule log index to find it ++ * if (mulelog index is not created or there is no mule log inside it) ++ * old_mule_log <- requested dumping position ++ * requested dumping position <- 0 in the file ++ * else ++ * check whether the mule log matches the requested dump ++ * (whether the last mule log name/size matches) ++ * if the mule log name does not match ++ * exit with an error ++ * if (the mule log size does not match the requested dump position) ++ * request the dump from position 0 and read all events ++ * verify all events with the corresponding events in mule log ++ * if (the verification succeeds) ++ * continue the dump ++ * else ++ * exit with an error ++ */ ++ReplMule* ReplMule::createReplicationMule( ++ THD* thd, MASTER_INFO *mi, const char *binlog_indexname, ++ MYSQL_LOG *binlog) { ++ ReplMule *mule = NULL; ++ LOG_INFO linfo; ++ bool index_opened = false; ++ ++ DBUG_ENTER("ReplMule::createReplicationMule"); ++ ++ /* binlog_indexname must be set to some real value. */ ++ DBUG_ASSERT(binlog_indexname); ++ ++ /* Lock binlog index for all binlog operations */ ++ binlog->lock_index(); ++ index_opened = binlog->open_index_file(binlog_indexname, NULL); ++ DBUG_PRINT("info",("open index file succeed: %d", index_opened)); ++ sql_print_information("createReplicationMule"); ++ ++ /* Scan the existing binlog index to find the last relayed binlog */ ++ if (index_opened || ++ binlog->find_log_pos(&linfo, NullS, false) != 0) { ++ /* binlog index is not created or has no log file inside: ++ * . old_relay_binlog <- requested dumping position ++ * . 
requested dumping position <- 0 in the file ++ */ ++ if (mi->master_log_pos == BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, sync_mirror_binlog_period); ++ } else { ++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, sync_mirror_binlog_period); ++ } ++ ++ if (mule == NULL) { ++ sql_print_error("Mule malloc operation failed."); ++ } ++ } else { ++ IO_CACHE* log_file; ++ MY_STAT stat; ++ char last_binlog_name[FN_REFLEN]; ++ ++ /* Find the last log file from the binlog index. ++ * Check whether the last binlog matches the requested dump for both ++ * binlog name and binlog size. ++ */ ++ for (;;) { ++ strmake(last_binlog_name, linfo.log_file_name, FN_REFLEN); ++ last_binlog_name[FN_REFLEN - 1] = '\0'; ++ if (binlog->find_next_log(&linfo, false)) ++ break; ++ } ++ DBUG_PRINT("info",("the last binlog: %s", last_binlog_name)); ++ ++ /* if the binlog name does not match, exit with an error. */ ++ if (strcmp(last_binlog_name+dirname_length(last_binlog_name), ++ mi->master_log_name) != 0) { ++ sql_print_error("Mule binlog(%s) does not match new relay-binlog(%s)", ++ last_binlog_name, mi->master_log_name); ++ } /* Open the last binlog. */ ++ else if (binlog->open(last_binlog_name, LOG_BIN, NULL, ++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) { ++ sql_print_error("Mule open last binlog failed: %s", last_binlog_name); ++ } else { ++ bool valid_file_size = true; ++ ++ /* Get the binlog size. */ ++ log_file = binlog->get_log_file(); ++ if (my_fstat(log_file->file, &stat, MYF(0)) == 0) { ++ /* If the binlog size does not match the requested dump position, then ++ * request the dump from position 0 and verify all events, we need to ++ * verify events because the mule log might be used for serving during ++ * anytime. We must be sure that they are correct. 
++ */ ++ sql_print_information("Binglog size %d", stat.st_size); ++ if (stat.st_size == mi->master_log_pos) { ++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE_RUN, stat.st_size, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else if (stat.st_size > BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, MULE_VERIFY, stat.st_size, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else if (stat.st_size == BIN_LOG_HEADER_SIZE) { ++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE, ++ binlog_indexname, binlog, ++ sync_mirror_binlog_period); ++ } else { ++ char llbuf[22]; ++ valid_file_size = false; ++ sql_print_error("Mule binlog file(%s) invalid size: %s", ++ last_binlog_name, llstr(stat.st_size, llbuf)); ++ } ++ } else { ++ valid_file_size = false; ++ sql_print_error("Mule binlog file(%s): fstat failed.", ++ last_binlog_name); ++ } ++ ++ if (valid_file_size) { ++ if (mule == NULL) { ++ sql_print_error("Mule malloc operation failed."); ++ } else if (mule->status_ == MULE_ERROR) { ++ /* If mule creation fails, indicate the error. */ ++ delete mule; ++ mule = NULL; ++ } ++ } ++ } ++ } ++ ++ /* Clear the mule binlog mode if there are errors. */ ++ if (mule == NULL) { ++ binlog->clear_mule_mode(); ++ binlog->close_index_file(); ++ } ++ ++ /* Unlock binlog index */ ++ binlog->unlock_index(); ++ ++ DBUG_RETURN(mule); ++} +diff -r 66cc9e0a6768 sql/repl_mule.h +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ b/sql/repl_mule.h Thu Dec 04 21:46:15 2008 -0800 +@@ -0,0 +1,166 @@ ++/* ++ Copyright (C) 2007 Google Inc. ++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public License ++as published by the Free Software Foundation; either version 2 ++of the License, or (at your option) any later version. 
++ ++This program is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with this program; if not, write to the Free Software ++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++*/ ++ ++#ifndef SQL_REPL_MULE_H__ ++#define SQL_REPL_MULE_H__ ++ ++/* Replication Mule is the class that is responsible for generating ++ * an exact copy of the binlog from a master database. We call this feature ++ * mirror binlog and it can be enabled by setting rpl_mirror_binlog. We ++ * need to keep the same copy for the following purposes: ++ * . The replica can serve the binlog transparently as if they are the ++ * master database. This can relieve master connection overhead. ++ * . During failover, the replica can become the new master and serve ++ * old binlogs transparently. ++ * (The Mule name comes from the popular P2P software eMule.) ++ * ++ * Internally, we call the mirrored binlog mule log. ++ */ ++ ++class THD; ++class Rotate_log_event; ++class Format_description_log_event; ++typedef struct st_master_info MASTER_INFO; ++ ++class ReplMule { ++ public: ++ /* Because I/O thread also creates relay-binlog, instead of an exact ++ * copy of the original master's binlog, we have two resources that ++ * might get out of sync. 
++ * This enum indicates the status: ++ * MULE_BEHIND - the mule's header is behind: ++ * (mule is activated for the first time) ++ * RELAY_MATCH_MULE - mule matches relay-log ++ * RELAY_MATCH_MULE_RUN - mule matches relay-log and it was not empty binlog ++ * MULE_VERIFY - mule has more events than the relay-log and needs ++ * verification; we can not verify based on relay-log ++ * events because events might get changed a little; ++ * verification starts with downloading all events in ++ * the last binlog from the master and compare with ++ * all events in the mule log; ++ * MULE_VERIFY_RELAY_BEHIND - mule has more events than the relay-log ++ * and relay-log needs to write events ++ * MULE_ERROR - mule detects errors in event duplicate ++ * ++ * When the mule mirrors binlogs, it writes an event into the mule log ++ * first. Then, I/O thread writes the event into the relay log. ++ */ ++ enum RelayStatus { ++ MULE_BEHIND = 1, ++ RELAY_MATCH_MULE = 2, ++ RELAY_MATCH_MULE_RUN = 7, ++ MULE_VERIFY = 3, ++ MULE_VERIFY_RELAY_BEHIND = 4, ++ MULE_ERROR = 5, ++ }; ++ ++ enum WriteStatus { ++ WRITE_RELAY = 1, ++ WRITE_ERROR = 2, ++ SKIP_RELAY = 3, ++ }; ++ ++ private: ++ const Format_description_log_event *desc_event_; ++ THD *io_thd_; ++ MASTER_INFO *mi_; ++ ++ /* ++ * I/O thread will write both mule log for mirror binlog and relay log ++ * for SQL thread. ++ * The variable indicates whether the two are in sync. ++ */ ++ RelayStatus status_; ++ ++ /* The starting event writing position. */ ++ my_off_t dump_position_; ++ ++ /* During the initial setup, the last mule log's file size. */ ++ my_off_t file_size_; ++ ++ /* Internally, we call the mirrored binlog mule log. */ ++ MYSQL_LOG *mule_log_; ++ ++ /* Sync the mule log to disk for every #N events. 
*/ ++ ulong mule_log_sync_period_; ++ ulong mule_log_event_counter_; ++ ++ /* mule log's index filename */ ++ char mule_indexname_[FN_REFLEN]; ++ ++ /* the current mule log's filename */ ++ char curr_log_filename_[FN_REFLEN]; ++ ++ ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status, ++ my_off_t file_size, const char *binlog_indexname, ++ MYSQL_LOG *binlog, ulong sync_period); ++ ++ /* ++ * Queue the event into the current mule log. If it is a rotation ++ * event, generate a new mule log file. ++ * Indicate whether the event is skipped because it is an fake event. ++ * A fake event is generated by the master to indicate the current ++ * reading position. ++ */ ++ int queueEvent(const char* buf, ulong event_len, bool *skip_event); ++ ++ /* Append the event to the current mule log. */ ++ int appendEvent(const char* buf, ulong event_len); ++ ++ bool IsFakeRotation(const char* buf, ulong event_len); ++ bool IsFakeRotation(const Rotate_log_event& rev); ++ ++ /* Seek to the specified position in the current open mule log. */ ++ void seekToPosition(my_off_t pos); ++ ++ public: ++ ++ ~ReplMule(); ++ ++ /* Dump the event into mule binlog. ++ * Input: ++ * buf (IN) - replication event buffer ++ * event_len (IN) - the event length ++ * ++ * Return: ++ * . WRITE_RELAY: the relay log needs to writing the event ++ * . WRITE_ERROR: the writing encountered errors ++ * . SKIP_RELAY: the relay log should skip the event ++ */ ++ WriteStatus writeEvent(const char* buf, ulong event_len); ++ ++ /* createReplicationMule: ++ * Create a mule that relays master's replication binlog and ++ * generate an exact same copy on the local filesystem. ++ * ++ * Input: ++ * thd (IN) - replication I/O thread ++ * mi (IN) - master info struct for I/O thread's progress ++ * binlog_indexname (IN) - filename for binlog's index ++ * binlog (IN) - replication binlog ++ * ++ * Return: ++ * . a replication mule if success ++ * . 
NULL if there are any errors ++ */ ++ static ReplMule *createReplicationMule(THD* thd, MASTER_INFO *mi, ++ const char *binlog_indexname, ++ MYSQL_LOG *binlog); ++}; ++ ++#endif /* SQL_REPL_MULE_H__ */ +diff -r 66cc9e0a6768 sql/set_var.cc +--- a/sql/set_var.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/set_var.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -345,6 +345,8 @@ + slog_verb); + sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank", + &rpl_recovery_rank); ++sys_var_bool_ptr sys_rpl_mirror_binlog_enabled("rpl_mirror_binlog_enabled", ++ &rpl_mirror_binlog_enabled); + sys_var_long_ptr sys_query_cache_size("query_cache_size", + &query_cache_size, + fix_query_cache_size); +@@ -364,6 +366,9 @@ + sys_var_thd_ulong sys_trans_prealloc_size("transaction_prealloc_size", + &SV::trans_prealloc_size, + 0, fix_trans_mem_root); ++sys_var_long_ptr sys_sync_mirror_binlog_period( ++ "sync_mirror_binlog_period", ++ &sync_mirror_binlog_period); + + #ifdef HAVE_QUERY_CACHE + sys_var_long_ptr sys_query_cache_limit("query_cache_limit", +@@ -774,6 +779,7 @@ + &sys_relay_log_purge, + #endif + &sys_rpl_recovery_rank, ++ &sys_rpl_mirror_binlog_enabled, + &sys_safe_updates, + &sys_secure_auth, + &sys_secure_file_priv, +@@ -1113,6 +1119,8 @@ + {"relay_log_space_limit", (char*) &relay_log_space_limit, SHOW_LONGLONG}, + #endif + {sys_rpl_recovery_rank.name,(char*) &sys_rpl_recovery_rank, SHOW_SYS}, ++ {sys_rpl_mirror_binlog_enabled.name, ++ (char *) &sys_rpl_mirror_binlog_enabled, SHOW_SYS}, + {"secure_auth", (char*) &sys_secure_auth, SHOW_SYS}, + {"secure_file_priv", (char*) &sys_secure_file_priv, SHOW_SYS}, + #ifdef HAVE_SMEM +diff -r 66cc9e0a6768 sql/slave.cc +--- a/sql/slave.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/slave.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -25,6 +25,7 @@ + #include <thr_alarm.h> + #include <my_dir.h> + #include <sql_common.h> ++#include "repl_mule.h" + #include <errmsg.h> + #include <mysys_err.h> + +@@ -3527,6 +3528,7 @@ + RELAY_LOG_INFO *rli= &mi->rli; + char 
llbuff[22]; + uint retry_count; ++ ReplMule *mule = NULL; + + // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff + my_thread_init(); +@@ -3609,6 +3611,23 @@ + if (get_master_version_and_clock(mysql, mi)) + goto err; + ++ if (rpl_mirror_binlog_enabled && !mule) { ++ if (opt_binlog_index_name == NULL) { ++ sql_print_error("\"log-bin-index\" must be set in mirror binlog."); ++ goto err; ++ } ++ ++ /* Create the mule to generate the exact copy of the binlog */ ++ mule = ReplMule::createReplicationMule( ++ thd, mi, opt_binlog_index_name, &mysql_bin_log); ++ ++ /* If we could not create the mule, we stop the I/O thread and report ++ * an error. ++ */ ++ if (mule == NULL) ++ goto err; ++ } ++ + if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1) + { + /* +@@ -3624,6 +3643,7 @@ + DBUG_PRINT("info",("Starting reading binary log from master")); + while (!io_slave_killed(thd,mi)) + { ++ const char* event_buf; + bool suppress_warnings= 0; + thd_proc_info(thd, "Requesting binlog dump"); + if (request_dump(mysql, mi, &suppress_warnings)) +@@ -3754,10 +3774,25 @@ + goto connected; + } // if (event_len == packet_error) + ++ event_buf = (const char*)mysql->net.read_pos + 1; ++ ++ if (mule) { ++ ReplMule::WriteStatus d_status = ++ mule->writeEvent(event_buf, event_len); ++ switch (d_status) { ++ case ReplMule::WRITE_RELAY: ++ break; ++ case ReplMule::SKIP_RELAY: ++ /* Skip writing relay event; go back to read the next event */ ++ continue; ++ case ReplMule::WRITE_ERROR: ++ goto err; ++ } ++ } ++ + retry_count=0; // ok event, reset retry counter + thd_proc_info(thd, "Queueing master event to the relay log"); +- if (queue_event(mi,(const char*)mysql->net.read_pos + 1, +- event_len)) ++ if (queue_event(mi, event_buf, event_len)) + { + sql_print_error("Slave I/O thread could not queue event from master"); + goto err; +@@ -3847,6 +3882,7 @@ + change_rpl_status(RPL_ACTIVE_SLAVE,RPL_IDLE_SLAVE); + DBUG_ASSERT(thd->net.buff != 0); + 
net_end(&thd->net); // destructor will not free it, because net.vio is 0 ++ delete mule; + close_thread_tables(thd, 0); + pthread_mutex_lock(&LOCK_thread_count); + THD_CHECK_SENTRY(thd); +diff -r 66cc9e0a6768 sql/sql_class.h +--- a/sql/sql_class.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_class.h Thu Dec 04 21:46:15 2008 -0800 +@@ -152,6 +152,12 @@ + #define LOG_INFO_FATAL -7 + #define LOG_INFO_IN_USE -8 + ++/* If the maximum size is equal to this value, binlog would not rotate on ++ * size limit. ++ */ ++#define BINLOG_NOSWITCH_SIZE ((ulong) -1) ++ ++ + /* bitmap to SQL_LOG::close() */ + #define LOG_CLOSE_INDEX 1 + #define LOG_CLOSE_TO_BE_OPENED 2 +@@ -245,6 +251,9 @@ + bool no_auto_events; + friend class Log_event; + ++ /* mule replication mode */ ++ bool mule_binlog_; ++ + public: + /* + These describe the log's format. This is used only for relay logs. +@@ -317,7 +326,8 @@ + } + bool open_index_file(const char *index_file_name_arg, + const char *log_name); +- void new_file(bool need_lock); ++ int close_index_file(); ++ void new_file(bool need_lock= 1, const char* log_filename= NULL); + bool write(THD *thd, enum enum_server_command command, + const char *format, ...) ATTRIBUTE_FORMAT(printf, 4, 5); + bool write(THD *thd, const char *query, uint query_length, +@@ -357,7 +367,27 @@ + int get_current_log(LOG_INFO* linfo); + int raw_get_current_log(LOG_INFO* linfo); + uint next_file_id(); +- inline bool is_open() { return log_type != LOG_CLOSED; } ++ ++ /* Because mysql use is_open() to check whether replication is on, ++ * we will let the check fail during binlog mule mode. Mule replication ++ * and normal master replication can not be on at the same time. ++ * ++ * is_log_open(): the binlog file is open for either purpose ++ * ++ * is_open(): the binlog is open for master replication. 
++ * is_mule_open(): the binlog is open for mirror binlog or for ++ * replication mule; refer repl_mule.h for details ++ */ ++ bool is_log_open() { ++ return log_type != LOG_CLOSED; ++ } ++ bool is_open() { ++ return (!mule_binlog_) && is_log_open(); ++ } ++ bool is_mule_open() { ++ return (mule_binlog_) && is_log_open(); ++ } ++ + inline char* get_index_fname() { return index_file_name;} + inline char* get_log_fname() { return log_file_name; } + inline char* get_name() { return name; } +@@ -366,8 +396,18 @@ + + inline void lock_index() { pthread_mutex_lock(&LOCK_index);} + inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);} ++ inline void lock_log() { pthread_mutex_lock(&LOCK_log);} ++ inline void unlock_log() { pthread_mutex_unlock(&LOCK_log);} + inline IO_CACHE *get_index_file() { return &index_file;} + inline uint32 get_open_count() { return open_count; } ++ /* Look in file repl_mule.h for the definition of mule. */ ++ void set_mule_mode() { ++ mule_binlog_ = 1; ++ } ++ void clear_mule_mode() { ++ mule_binlog_ = 0; ++ } ++ int flush_log_file(); + }; + + /* +diff -r 66cc9e0a6768 sql/sql_lex.h +--- a/sql/sql_lex.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_lex.h Thu Dec 04 21:46:15 2008 -0800 +@@ -104,6 +104,7 @@ + // TODO(mcallaghan): update status_vars in mysqld to export these + SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS, + SQLCOM_SHOW_CLIENT_STATS, ++ SQLCOM_MAKE_MASTER, + /* This should be the last !!! 
*/ + SQLCOM_END + }; +@@ -171,6 +172,12 @@ + char *ssl_key, *ssl_cert, *ssl_ca, *ssl_capath, *ssl_cipher; + char *relay_log_name; + ulong relay_log_pos; ++ ++ /* the following fields are used for make master command */ ++ char *log_index_name; ++ bool in_failover; ++ bool kill_session; ++ bool with_old_binlog; + } LEX_MASTER_INFO; + + +diff -r 66cc9e0a6768 sql/sql_parse.cc +--- a/sql/sql_parse.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_parse.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -402,6 +402,15 @@ + passwd_len ? "yes": "no", + thd->main_security_ctx.master_access, + (thd->db ? thd->db : "*none*"))); ++ ++ /* If we are in failover mode, reject all non-super user connections. */ ++ if (is_in_failover() && ++ !(thd->main_security_ctx.master_access & SUPER_ACL)) { ++ net_send_error(thd, ER_SPECIFIC_ACCESS_DENIED_ERROR, ++ "super-user only during failover"); ++ DBUG_RETURN(-1); ++ } ++ + + if (check_count) + { +@@ -3470,6 +3479,22 @@ + else + res = load_master_data(thd); + break; ++ ++ case SQLCOM_MAKE_MASTER: ++ { ++ thd_proc_info(thd, "Making master"); ++ ++ if (check_global_access(thd, SUPER_ACL)) ++ goto error; ++ res = make_master(thd, NULL, NULL, &lex->mi); ++ if (res == 0) { ++ // TODO -- wei is this OK, setting it to NULL? 
++ thd_proc_info(thd, 0); ++ send_ok(thd); ++ } ++ break; ++ } ++ + #endif /* HAVE_REPLICATION */ + #ifdef HAVE_NDBCLUSTER_DB + case SQLCOM_SHOW_NDBCLUSTER_STATUS: +diff -r 66cc9e0a6768 sql/sql_repl.cc +--- a/sql/sql_repl.cc Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_repl.cc Thu Dec 04 21:46:15 2008 -0800 +@@ -20,11 +20,19 @@ + #include "log_event.h" + #include <my_dir.h> + ++extern pthread_mutex_t LOCK_failover_master; ++extern bool failover_deny_access; ++ + int max_binlog_dump_events = 0; // unlimited + my_bool opt_sporadic_binlog_dump_fail = 0; + #ifndef DBUG_OFF + static int binlog_dump_count = 0; + #endif ++ ++static int make_master_open_log(MYSQL_LOG *log, const char *opt_name, ++ bool no_auto_events, ulong max_size); ++static int set_in_failover(bool kill_session); ++static void clear_in_failover(void); + + /* + fake_rotate_event() builds a fake (=which does not exist physically in any +@@ -255,7 +263,7 @@ + bool purge_master_logs(THD* thd, const char* to_log) + { + char search_file_name[FN_REFLEN]; +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + send_ok(thd); + return FALSE; +@@ -308,6 +316,44 @@ + return error; + } + ++/* Show processlist command dump the binlog state. ++ * ++ * Input: ++ * output_info - (OUT) the output proc_info ++ * output_len - (IN) output proc_info's length ++ * thd - (IN) the thread ++ * input_msg - (IN) the input proc_info ++ * log_file_name - (IN) binlog file name ++ * log_pos - (IN) binlog position ++ */ ++static void processlist_show_binlog_state(char *output_info, ++ int output_len, ++ THD *thd, ++ const char *input_msg, ++ const char *log_file_name, ++ my_off_t log_pos) { ++ DBUG_ENTER("processlist_show_binlog_state"); ++ ++ /* Point to input_msg in case "show processlist" access it before the copy ++ * is finished. 
++ */ ++ thd_proc_info(thd, input_msg); ++ ++ if (snprintf(output_info, output_len, "%s :%s:%lld:", input_msg, ++ log_file_name + dirname_length(log_file_name), ++ log_pos) > 0) { ++ thd_proc_info(thd, output_info); ++ } ++ ++ DBUG_VOID_RETURN; ++} ++ ++static void repl_cleanup(ushort flags) { ++ if (flags & BINLOG_MIRROR_CLIENT) { ++ /* One less mirror binlog client. */ ++ thread_safe_sub(rpl_mirror_binlog_clients, 1, &LOCK_stats); ++ } ++} + + /* + TODO: Clean up loop to only have one call to send_file() +@@ -319,6 +365,11 @@ + LOG_INFO linfo; + char *log_file_name = linfo.log_file_name; + char search_file_name[FN_REFLEN], *name; ++ ++ /* This buffer should be enough for "comments + :file_name:file_pos:". */ ++ char binlog_state_msg[FN_REFLEN + 100]; ++ int binlog_state_msg_len = FN_REFLEN + 100; ++ + IO_CACHE log; + File file = -1; + String* packet = &thd->packet; +@@ -335,6 +386,15 @@ + + bzero((char*) &log,sizeof(log)); + ++ sql_print_information("Start %s binlog_dump to slave_server(%d), pos(%s, %lu)", ++ "asynchronous", ++ thd->server_id, log_ident, (ulong)pos); ++ ++ if (flags & BINLOG_MIRROR_CLIENT) { ++ /* One more mirror binlog clients. */ ++ thread_safe_increment(rpl_mirror_binlog_clients, &LOCK_stats); ++ } ++ + #ifndef DBUG_OFF + if (opt_sporadic_binlog_dump_fail && (binlog_dump_count++ % 2)) + { +@@ -344,7 +404,7 @@ + } + #endif + +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + errmsg = "Binary log is not open"; + my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG; +@@ -529,6 +589,12 @@ + } + #endif + ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Send binlog events to slave", ++ log_file_name, pos); ++ + if ((*packet)[EVENT_TYPE_OFFSET+1] == FORMAT_DESCRIPTION_EVENT) + { + binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+1] & +@@ -634,6 +700,13 @@ + } + if (!thd->killed) + { ++ /* Update the binlog sending state. 
*/ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Has sent all binlog to slave; " ++ "waiting for binlog to be updated", ++ log_file_name, pos); ++ + /* Note that the following call unlocks lock_log */ + mysql_bin_log.wait_for_update(thd, 0); + } +@@ -650,7 +723,12 @@ + + if (read_packet) + { +- thd_proc_info(thd, "Sending binlog event to slave"); ++ // thd_proc_info(thd, "Sending binlog event to slave"); ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state(binlog_state_msg, ++ binlog_state_msg_len, thd, ++ "Sending binlog event to slave", ++ log_file_name, pos); + if (my_net_write(net, (char*)packet->ptr(), packet->length()) ) + { + errmsg = "Failed on my_net_write()"; +@@ -685,10 +763,21 @@ + } + else + { ++ char old_log_file_name[FN_REFLEN]; + bool loop_breaker = 0; + /* need this to break out of the for loop from switch */ + +- thd_proc_info(thd, "Finished reading one binlog; switching to next binlog"); ++ // thd_proc_info(thd, "Finished reading one binlog; switching to next binlog"); ++ /* Update the binlog sending state. */ ++ processlist_show_binlog_state( ++ binlog_state_msg, binlog_state_msg_len, thd, ++ "Finished reading one binlog; switching to next binlog", ++ log_file_name, pos); ++ ++ /* Keep the old fileename. */ ++ strmake(old_log_file_name, log_file_name, ++ sizeof(old_log_file_name) - 1); ++ + switch (mysql_bin_log.find_next_log(&linfo, 1)) { + case LOG_INFO_EOF: + loop_breaker = (flags & BINLOG_DUMP_NON_BLOCK); +@@ -706,6 +795,16 @@ + + end_io_cache(&log); + (void) my_close(file, MYF(MY_WME)); ++ ++ /* A sanity check that we can not serve the same binlog twice because ++ * the filenames are stored in a .index file. 
++ */ ++ if (strcmp(old_log_file_name, log_file_name) >= 0) { ++ errmsg = "Re-serving an already served binlog file."; ++ my_errno = ER_MASTER_FATAL_ERROR_READING_BINLOG; ++ goto err; ++ } ++ + + /* + Call fake_rotate_event() in case the previous log (the one which +@@ -733,6 +832,8 @@ + end_io_cache(&log); + (void)my_close(file, MYF(MY_WME)); + ++ repl_cleanup(flags); ++ + send_eof(thd); + thd_proc_info(thd, "Waiting to finalize termination"); + pthread_mutex_lock(&LOCK_thread_count); +@@ -743,6 +844,7 @@ + err: + thd_proc_info(thd, "Waiting to finalize termination"); + end_io_cache(&log); ++ repl_cleanup(flags); + /* + Exclude iteration through thread list + this is needed for purge_logs() - it will iterate through +@@ -1316,7 +1418,7 @@ + Format_description_log_event *description_event= new + Format_description_log_event(3); /* MySQL 4.0 by default */ + +- if (mysql_bin_log.is_open()) ++ if (mysql_bin_log.is_log_open()) + { + LEX_MASTER_INFO *lex_mi= &thd->lex->mi; + SELECT_LEX_UNIT *unit= &thd->lex->unit; +@@ -1456,7 +1558,7 @@ + DBUG_RETURN(TRUE); + protocol->prepare_for_resend(); + +- if (mysql_bin_log.is_open()) ++ if (mysql_bin_log.is_log_open()) + { + LOG_INFO li; + mysql_bin_log.get_current_log(&li); +@@ -1497,7 +1599,7 @@ + Protocol *protocol= thd->protocol; + DBUG_ENTER("show_binlogs"); + +- if (!mysql_bin_log.is_open()) ++ if (!mysql_bin_log.is_log_open()) + { + my_message(ER_NO_BINARY_LOGGING, ER(ER_NO_BINARY_LOGGING), MYF(0)); + return 1; +@@ -1606,6 +1708,235 @@ + DBUG_RETURN(0); + } + ++ ++/* make_master: Make the current database a primary and starts the ++ * binlog logging for all updates. ++ * ++ * The function handles the following sql commands: ++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1, ++ * [WITH BINLOG]; ++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1, ++ * INDEX='replication_log.index' [WITH BINLOG]; ++ * . MAKE MASTER REVOKE SESSION; ++ * . MAKE MASTER REVOKE SESSION WITH KILL; ++ * . 
MAKE MASTER GRANT SESSION; ++ * ++ * Args: ++ * thd - the current thread ++ * binlog_name - binlog's filename ++ * binlog_indexname - binlog index's filename ++ * mi - master info struct containing binlog name ++ * (set when we enable master during runtime) ++ * ++ * Return: ++ * 0 : success ++ * -1 : failure ++ */ ++int make_master(THD* thd, ++ const char *binlog_name, ++ const char *binlog_indexname, ++ const LEX_MASTER_INFO* mi) { ++ int error = 0; ++ ++ DBUG_ENTER("make_master"); ++ /* In two mode, we enable the binlog: ++ * . !mi - LEX is not provided; this is called from startup time ++ * . mi->log_file_name - binlog is specified in the command ++ */ ++ if (!mi || mi->log_file_name) { ++ /* Get the mutex */ ++ VOID(pthread_mutex_lock(&LOCK_failover_master)); ++ ++ /* If the binlog is already opened, we issue an error. We reuse one ++ * existing error, which might not be fully accurate. ++ */ ++ if (mysql_bin_log.is_log_open()) { ++ my_error(ER_MASTER_INFO, MYF(0)); ++ sql_print_error("Replication master log is already open: cannot " ++ "make another master!"); ++ error = -1; ++ } else { ++ if (!mi) { ++ /* This opening happens at mysql startup time. */ ++ if (make_master_open_log(&mysql_bin_log, binlog_name, ++ 0, max_binlog_size) != 0) { ++ error = -1; ++ } ++ } else { ++ /* This opening happens during mysql runtime, which is mostly ++ * requested to do failover. ++ */ ++ ++ error = -1; ++ if (!is_in_failover()) { ++ sql_print_error( ++ "\"make master\" runs only in failover mode. " ++ "Please run \"make master revoke session (with kill)\""); ++ } else if (strlen(mi->log_file_name) == 0) { ++ sql_print_error("Master log filename is not specified correctly."); ++ } else if (!mi->server_id || mi->server_id == MASTER_INFO_SERVER_ID) { ++ sql_print_error("\"make master\": invalid server_id(%d)", ++ mi->server_id); ++ } else { ++ /* Open the new log files and delete all existing ones to avoid ++ * conflicts. 
++ */ ++ uint32 old_server_id = server_id; ++ char *binlog_name = NULL; ++ ++ /* Set the global master server id. ++ * We would not change server id for all connection threads. ++ * All non-super sessions should be blocked by revoke sessions. ++ * Super-user sessions are responsible for their own operations. ++ */ ++ server_id = mi->server_id; ++ thd->server_id = mi->server_id; ++ ++ if (!(binlog_name = my_strdup(mi->log_file_name, MYF(0))) || ++ make_master_open_index(&binlog_name, mi->log_index_name) != 0 || ++ make_master_open_log(&mysql_bin_log, binlog_name, ++ 0, max_binlog_size) != 0) { ++ sql_print_error("Open master logfile failed."); ++ thd->server_id = old_server_id; ++ server_id = old_server_id; ++ } else if (!mi->with_old_binlog && ++ mysql_bin_log.reset_logs(thd) != 0) { ++ sql_print_error("Cleanup existing master logfiles failed."); ++ thd->server_id = old_server_id; ++ server_id = old_server_id; ++ } else { ++ error = 0; ++ } ++ } ++ if (error == -1) ++ my_error(ER_MASTER_INFO, MYF(0)); ++ } ++ } ++ ++ if (error == 0) { ++ /* indicates that binlog is enabled now */ ++ using_update_log = 1; ++ } else if (mysql_bin_log.is_open()) { ++ mysql_bin_log.close(LOG_CLOSE_INDEX); ++ } ++ ++ /* Release the mutex */ ++ VOID(pthread_mutex_unlock(&LOCK_failover_master)); ++ } else { ++ /* The following actions are related to session management during ++ * failover operation. We do not want some sessions come in ++ * during failover and make updates. 
++ * This is invoked for command: MAKE MASTER GRANT/REVOKE SESSION; ++ */ ++ if (mi->in_failover) { ++ set_in_failover(mi->kill_session); ++ } else { ++ clear_in_failover(); ++ } ++ } ++ ++ DBUG_RETURN(error); ++} ++ ++static int make_master_open_log(MYSQL_LOG *log, ++ const char *opt_name, ++ bool no_auto_events, ++ ulong max_size) { ++ char tmp[FN_REFLEN]; ++ ++ // get rid of extension ++ char *p = fn_ext(opt_name); ++ uint length=(uint) (p-opt_name); ++ strmake(tmp,opt_name,min(length,FN_REFLEN)); ++ opt_name=tmp; ++ ++ return log->open(opt_name, LOG_BIN, NULL, WRITE_CACHE, 0, ++ max_size, 0); ++} ++ ++int make_master_open_index(char **binlog_name, ++ const char *binlog_indexname) { ++ char buf[FN_REFLEN]; ++ const char *ln; ++ DBUG_ENTER("make_master_open_index"); ++ ++ ln= mysql_bin_log.generate_name(*binlog_name, "-bin", 1, buf); ++ if (!(*binlog_name) && !binlog_indexname) { ++ /* ++ User didn't give us info to name the binlog index file. ++ Picking `hostname`-bin.index like did in 4.x, causes replication to ++ fail if the hostname is changed later. So, we would like to instead ++ require a name. But as we don't want to break many existing setups, we ++ only give warning, not error. ++ */ ++ sql_print_warning("No argument was provided to --log-bin, and " ++ "--log-bin-index was not used; so replication " ++ "may break when this MySQL server acts as a " ++ "master and has his hostname changed!! Please " ++ "use '--log-bin=%s' to avoid this problem.", ln); ++ } ++ if (ln == buf) { ++ my_free(*binlog_name, MYF(MY_ALLOW_ZERO_PTR)); ++ *binlog_name = my_strdup(buf, MYF(0)); ++ } ++ if (mysql_bin_log.open_index_file(binlog_indexname, ln) != 0) { ++ DBUG_RETURN(-1); ++ } ++ ++ /* ++ Used to specify which type of lock we need to use for queries of type ++ INSERT ... SELECT. This will change when we have row level logging. 
++ */ ++ using_update_log=1; ++ ++ DBUG_RETURN(0); ++} ++ ++/* Set the status indicating that we are in failover and deny all non-super ++ * user access. ++ * ++ * Args: ++ * kill_session - kill all non-super sessions if specified ++ * ++ * Return: ++ * 0 - success ++ * -1 - failure (caused by not killing all sessions) ++ */ ++static int set_in_failover(bool kill_session) { ++ failover_deny_access = 1; ++ ++ if (kill_session) { ++ /* If kill session option is specified, we need to kill all non-super ++ * user sessions. ++ */ ++ THD *kill_thd; ++ ++ uint error=ER_NO_SUCH_THREAD; ++ pthread_mutex_lock(&LOCK_thread_count); // For unlink from list ++ I_List_iterator<THD> it(threads); ++ while ((kill_thd=it++)) { ++ if (!(kill_thd->main_security_ctx.master_access & SUPER_ACL)) { ++ pthread_mutex_lock(&kill_thd->LOCK_delete); // Lock from delete ++ ++ /* ask the thread to die */ ++ kill_thd->awake(THD::KILL_CONNECTION); ++ pthread_mutex_unlock(&kill_thd->LOCK_delete); ++ } ++ } ++ pthread_mutex_unlock(&LOCK_thread_count); ++ } ++ return 0; ++} ++ ++static void clear_in_failover(void) { ++ failover_deny_access = 0; ++} ++ ++bool is_in_failover(void) { ++ return failover_deny_access; ++} ++ ++ + #endif /* HAVE_REPLICATION */ + + +diff -r 66cc9e0a6768 sql/sql_repl.h +--- a/sql/sql_repl.h Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_repl.h Thu Dec 04 21:46:15 2008 -0800 +@@ -38,6 +38,10 @@ + int start_slave(THD* thd, MASTER_INFO* mi, bool net_report); + int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report); + bool change_master(THD* thd, MASTER_INFO* mi); ++int make_master(THD* thd, const char *binlog_name, ++ const char *binlog_indexname, const LEX_MASTER_INFO* mi); ++int make_master_open_index(char **binlog_name, const char *binlog_indexname); ++bool is_in_failover(void); + bool mysql_show_binlog_events(THD* thd); + int cmp_master_pos(const char* log_file_name1, ulonglong log_pos1, + const char* log_file_name2, ulonglong log_pos2); +diff -r 66cc9e0a6768 
sql/sql_yacc.yy +--- a/sql/sql_yacc.yy Thu Dec 04 21:37:12 2008 -0800 ++++ b/sql/sql_yacc.yy Thu Dec 04 21:46:15 2008 -0800 +@@ -735,6 +735,7 @@ + %token LOOP_SYM + %token LOW_PRIORITY + %token LT ++%token MAKE_SYM + %token MAKE_SET_SYM + %token MASTER_CONNECT_RETRY_SYM + %token MASTER_HOST_SYM +@@ -1167,7 +1168,7 @@ + query verb_clause create change select do drop insert replace insert2 + insert_values update delete truncate rename + show describe load alter optimize keycache preload flush +- reset purge begin commit rollback savepoint release ++ make reset purge begin commit rollback savepoint release + slave master_def master_defs master_file_def slave_until_opts + repair restore backup analyze check start checksum + field_list field_list_item field_spec kill column_def key_def +@@ -1301,6 +1302,7 @@ + | kill + | load + | lock ++ | make + | optimize + | keycache + | preload +@@ -1428,6 +1430,56 @@ + master_defs + {} + ; ++ ++/* make master */ ++make: ++ MAKE_SYM MASTER_SYM ++ { ++ LEX *lex = Lex; ++ lex->sql_command = SQLCOM_MAKE_MASTER; ++ bzero((char*) &lex->mi, sizeof(lex->mi)); ++ } ++ make_master_defs ++ { ++ } ++ ; ++ ++make_master_defs: ++ MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ++ { ++ Lex->mi.log_file_name = $3.str; ++ Lex->mi.server_id = $7; ++ } ++ make_master_with_defs {} ++ | MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ',' INDEX_SYM EQ TEXT_STRING ++ { ++ Lex->mi.log_file_name = $3.str; ++ Lex->mi.server_id = $7; ++ Lex->mi.log_index_name = $11.str; ++ } ++ make_master_with_defs {} ++ | GRANT SESSION_SYM ++ { ++ Lex->mi.in_failover = 0; ++ } ++ | REVOKE SESSION_SYM ++ { ++ Lex->mi.in_failover = 1; ++ } ++ | REVOKE SESSION_SYM WITH KILL_SYM ++ { ++ Lex->mi.in_failover = 1; ++ Lex->mi.kill_session = 1; ++ } ++ ; ++ ++make_master_with_defs: ++ /* empty */ {} ++ | WITH BINLOG_SYM ++ { ++ /* All old binlogs will be kept after "make master" command. 
*/ ++ Lex->mi.with_old_binlog = 1; ++ } + + master_defs: + master_def +@@ -8396,6 +8448,7 @@ + | HANDLER_SYM {} + | HELP_SYM {} + | LANGUAGE_SYM {} ++ | MAKE_SYM {} + | NO_SYM {} + | OPEN_SYM {} + | PREPARE_SYM {} |