summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJorge Manuel B. S. Vicetto (jmbsvicetto) <jmbsvicetto@gentoo.org>2009-11-25 04:24:02 -0100
committerJorge Manuel B. S. Vicetto (jmbsvicetto) <jmbsvicetto@gentoo.org>2009-11-25 04:24:02 -0100
commit30afbb10fb42cb7d2b861dbc925a033f1e33ab7e (patch)
tree43fa792ecdbb3e8ba85db51261a59c9985b4017e /percona
parentAdded percona patches for 5.0.87 and updated index for 5.0.87 release. (diff)
downloadmysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.tar.gz
mysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.tar.bz2
mysql-extras-30afbb10fb42cb7d2b861dbc925a033f1e33ab7e.zip
Added missing upstream patches and note about them not being applied by upstream and on Gentoo.
Diffstat (limited to 'percona')
-rw-r--r--percona/5.0.87-b20-20091116/README-GENTOO8
-rw-r--r--percona/5.0.87-b20-20091116/innodb_extra_status.patch747
-rw-r--r--percona/5.0.87-b20-20091116/innodb_io_tune.patch1823
-rw-r--r--percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch1357
-rw-r--r--percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch264
-rw-r--r--percona/5.0.87-b20-20091116/mirror_binlog.patch2694
6 files changed, 6893 insertions, 0 deletions
diff --git a/percona/5.0.87-b20-20091116/README-GENTOO b/percona/5.0.87-b20-20091116/README-GENTOO
new file mode 100644
index 0000000..a4e2724
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/README-GENTOO
@@ -0,0 +1,8 @@
+The following patches, while distributed by Percona, are NOT applied in their
+specfile. As such, we do not apply them in Gentoo either:
+=========
+innodb_extra_status.patch
+innodb_io_tune.patch
+innodb_rw_lock_old.patch
+innodb_show_hashed_memory_standalone.patch
+mirror_binlog.patch
diff --git a/percona/5.0.87-b20-20091116/innodb_extra_status.patch b/percona/5.0.87-b20-20091116/innodb_extra_status.patch
new file mode 100644
index 0000000..adc1642
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/innodb_extra_status.patch
@@ -0,0 +1,747 @@
+diff -r b059d02ec814 innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/buf/buf0buf.c Mon Nov 03 05:09:34 2008 -0800
+@@ -2353,6 +2353,7 @@
+ "AWE: Database pages and free buffers mapped in frames %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
++ if (file) {
+ fprintf(file,
+ "Buffer pool size %lu\n"
+ "Free buffers %lu\n"
+@@ -2371,11 +2372,13 @@
+ + buf_pool->init_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
++ } // if (file)
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool->last_printout_time);
+ buf_pool->last_printout_time = current_time;
+
++ if (file) {
+ fprintf(file,
+ "Pages read %lu, created %lu, written %lu\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+@@ -2405,6 +2408,7 @@
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
++ }
+ }
+
+ buf_pool->n_page_gets_old = buf_pool->n_page_gets;
+diff -r b059d02ec814 innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:09:34 2008 -0800
+@@ -3519,9 +3519,15 @@
+
+ mutex_enter(&ibuf_mutex);
+
++ inno_ibuf_size = 0;
++ inno_ibuf_inserts = 0;
++ inno_ibuf_merged_recs = 0;
++ inno_ibuf_merges = 0;
++
+ data = UT_LIST_GET_FIRST(ibuf->data_list);
+
+ while (data) {
++ if (file) {
+ fprintf(file,
+ "Ibuf: size %lu, free list len %lu, seg size %lu,\n"
+ "%lu inserts, %lu merged recs, %lu merges\n",
+@@ -3542,6 +3548,12 @@
+ }
+ }
+ #endif
++ } // if (file)
++ inno_ibuf_size += (ulong) data->size;
++ inno_ibuf_inserts += (ulong) data->n_inserts;
++ inno_ibuf_merged_recs += (ulong) data->n_merged_recs;
++ inno_ibuf_merges += (ulong) data->n_merges;
++
+ data = UT_LIST_GET_NEXT(data_list, data);
+ }
+
+diff -r b059d02ec814 innobase/include/lock0lock.h
+--- a/innobase/include/lock0lock.h Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/include/lock0lock.h Mon Nov 03 05:09:34 2008 -0800
+@@ -24,6 +24,10 @@
+ #endif /* UNIV_DEBUG */
+ /* Buffer for storing information about the most recent deadlock error */
+ extern FILE* lock_latest_err_file;
++
++/* number of deadlocks that have happened so far */
++extern ulint innodb_deadlocks;
++
+
+ /*************************************************************************
+ Gets the size of a lock struct. */
+diff -r b059d02ec814 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/include/srv0srv.h Mon Nov 03 05:09:34 2008 -0800
+@@ -261,6 +261,12 @@
+ /* variable to count the number of random read-aheads were done */
+ extern ulint srv_read_ahead_rnd;
+
++/* variable to identify if there is currently a long semaphore wait */
++extern ibool srv_long_lock_wait;
++
++/* variable to count the number of long semaphore waits noticed */
++extern ulint srv_long_lock_waits;
++
+ /* Number of IO operations read/write done for all threads */
+ extern ulint os_aio_read_requests;
+ extern ulint os_aio_write_requests;
+@@ -278,6 +284,26 @@
+ extern ulint inno_pending_ibuf_aio_reads;
+ extern ulint inno_pending_log_ios;
+ extern ulint inno_pending_sync_ios;
++
++/* all 24 innodb status variables, exported to status */
++extern ulint inno_transaction_count;
++extern ulint inno_transaction_purge_count;
++extern ulint inno_transaction_purge_lag;
++extern ulint inno_num_active_transactions;
++extern ulint inno_summed_transaction_age;
++extern ulint inno_longest_transaction_age;
++extern ulint inno_lock_wait_timeouts;
++extern ulint inno_num_lock_waiters;
++extern ulint inno_summed_lock_wait_time;
++extern ulint inno_longest_lock_wait;
++extern ulint inno_os_reads;
++extern ulint inno_os_writes;
++extern ulint inno_os_fsyncs;
++extern ulint inno_ibuf_size;
++extern ulint inno_ibuf_inserts;
++extern ulint inno_ibuf_merged_recs;
++extern ulint inno_ibuf_merges;
++extern ulint inno_log_ios_done;
+
+ /* In this structure we store status variables to be passed to MySQL */
+ typedef struct export_var_struct export_struc;
+@@ -552,6 +578,7 @@
+ ulint innodb_data_writes;
+ ulint innodb_data_written;
+ ulint innodb_data_reads;
++ ulint innodb_dict_size;
+ ulint innodb_buffer_pool_pages_total;
+ ulint innodb_buffer_pool_pages_data;
+ ulint innodb_buffer_pool_pages_dirty;
+@@ -587,6 +614,43 @@
+ ulint innodb_rows_inserted;
+ ulint innodb_rows_updated;
+ ulint innodb_rows_deleted;
++ ibool innodb_long_lock_wait;
++ ulint innodb_long_lock_waits;
++
++ ulint innodb_os_aio_read_requests;
++ ulint innodb_os_aio_write_requests;
++ ulint innodb_os_aio_pages_read;
++ ulint innodb_os_aio_pages_written;
++ ib_longlong innodb_os_aio_read_time;
++ ib_longlong innodb_os_aio_write_time;
++ ib_longlong innodb_os_aio_read_time_avg;
++ ib_longlong innodb_os_aio_write_time_avg;
++ ulint innodb_deadlocks;
++
++ // the following 24 variables are exported to "show status"
++ ulint inno_transaction_count;
++ ulint inno_transaction_purge_count;
++ ulint inno_transaction_purge_lag;
++ ulint inno_num_active_transactions;
++ ulint inno_summed_transaction_age;
++ ulint inno_longest_transaction_age;
++ ulint inno_lock_wait_timeouts;
++ ulint inno_num_lock_waiters;
++ ulint inno_summed_lock_wait_time;
++ ulint inno_longest_lock_wait;
++ ulint inno_pending_normal_aio_reads;
++ ulint inno_pending_normal_aio_writes;
++ ulint inno_pending_ibuf_aio_reads;
++ ulint inno_pending_log_ios;
++ ulint inno_pending_sync_ios;
++ ulint inno_os_reads;
++ ulint inno_os_writes;
++ ulint inno_os_fsyncs;
++ ulint inno_ibuf_size;
++ ulint inno_ibuf_inserts;
++ ulint inno_ibuf_merged_recs;
++ ulint inno_ibuf_merges;
++ ulint inno_log_ios_done;
+ };
+
+ /* The server system struct */
+diff -r b059d02ec814 innobase/lock/lock0lock.c
+--- a/innobase/lock/lock0lock.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/lock/lock0lock.c Mon Nov 03 05:09:34 2008 -0800
+@@ -360,6 +360,9 @@
+ ibool lock_deadlock_found = FALSE;
+ FILE* lock_latest_err_file;
+
++/* number of deadlocks that have happened so far */
++ulint innodb_deadlocks = 0;
++
+ /* Flags for recursive deadlock search */
+ #define LOCK_VICTIM_IS_START 1
+ #define LOCK_VICTIM_IS_OTHER 2
+@@ -3304,6 +3307,7 @@
+
+ FILE* ef = lock_latest_err_file;
+
++ innodb_deadlocks++;
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+@@ -4238,6 +4242,7 @@
+ innobase_mysql_prepare_print_arbitrary_thd();
+ lock_mutex_enter_kernel();
+
++ if (file) {
+ if (lock_deadlock_found) {
+ fputs(
+ "------------------------\n"
+@@ -4269,6 +4274,12 @@
+ fprintf(file,
+ "Total number of lock structs in row lock hash table %lu\n",
+ (ulong) lock_get_n_rec_locks());
++ } // if (file)
++ inno_transaction_purge_count =
++ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no);
++ inno_transaction_count =
++ (ulong) ut_dulint_get_low(trx_sys->max_trx_id);
++ inno_transaction_purge_lag = (ulong) trx_sys->rseg_history_len;
+ }
+
+ /*************************************************************************
+@@ -4289,7 +4300,17 @@
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx;
+-
++ time_t current_time = time(NULL);
++
++ /* init all counters to be updated */
++ inno_num_lock_waiters = 0;
++ inno_summed_lock_wait_time = 0;
++ inno_longest_lock_wait = 0;
++ inno_num_active_transactions = 0;
++ inno_summed_transaction_age = 0;
++ inno_longest_transaction_age = 0;
++
++ if (file) {
+ fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+ /* First print info on non-active transactions */
+@@ -4304,6 +4325,7 @@
+
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
++ } // if (file)
+
+ loop:
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+@@ -4330,6 +4352,7 @@
+ }
+
+ if (nth_lock == 0) {
++ if (file) {
+ fputs("---", file);
+ trx_print(file, trx, 600);
+
+@@ -4341,11 +4364,27 @@
+ (ulong) ut_dulint_get_high(trx->read_view->up_limit_id),
+ (ulong) ut_dulint_get_low(trx->read_view->up_limit_id));
+ }
++ } // if (file)
++
++ if (trx->conc_state == TRX_ACTIVE) {
++ ulong trx_age = (ulong)difftime(time(NULL), trx->start_time);
++ inno_num_active_transactions++;
++ inno_summed_transaction_age += trx_age;
++ if (inno_longest_transaction_age < trx_age) /* keep the maximum age seen */
++ inno_longest_transaction_age = trx_age;
++ }
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
++ ulong wait_time = (ulong)difftime(current_time,
++ trx->wait_started);
++ inno_num_lock_waiters++;
++ inno_summed_lock_wait_time += wait_time;
++ if (inno_longest_lock_wait < wait_time)
++ inno_longest_lock_wait = wait_time;
++ if (file) {
+ fprintf(file,
+ "------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n",
+- (ulong)difftime(time(NULL), trx->wait_started));
++ wait_time);
+
+ if (lock_get_type(trx->wait_lock) == LOCK_REC) {
+ lock_rec_print(file, trx->wait_lock);
+@@ -4354,10 +4393,16 @@
+ }
+
+ fputs("------------------\n", file);
+- }
+- }
+-
+- if (!srv_print_innodb_lock_monitor) {
++ } // if (file)
++ }
++ }
++
++ /* don't print locks per transaction if either
++ 1) srv_print_innodb_lock_monitor is NOT set,
++ ie no magic table innodb_lock_monitor is created, or
++ 2) file == NULL, ie, at counter updating stage from "show status"
++ */
++ if (!srv_print_innodb_lock_monitor || !file) {
+ nth_trx++;
+ goto loop;
+ }
+diff -r b059d02ec814 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:09:34 2008 -0800
+@@ -267,6 +267,35 @@
+ ulint inno_pending_log_ios = 0;
+ ulint inno_pending_sync_ios = 0;
+
++/* variable to identify if there is currently a long semaphore wait */
++ibool srv_long_lock_wait = FALSE;
++
++/* variable to count the number of long semaphore waits noticed */
++ulint srv_long_lock_waits = 0;
++
++/* minimum time interval in seconds between InnoDB status counter updates during SHOW STATUS */
++extern long innobase_min_status_update_time_interval;
++
++/* all 24 innodb status variables, exported to status */
++ulint inno_transaction_count = 0;
++ulint inno_transaction_purge_count = 0;
++ulint inno_transaction_purge_lag = 0;
++ulint inno_num_active_transactions = 0;
++ulint inno_summed_transaction_age = 0;
++ulint inno_longest_transaction_age = 0;
++ulint inno_lock_wait_timeouts = 0; /* Counts number of lock wait timeouts. */
++ulint inno_num_lock_waiters = 0;
++ulint inno_summed_lock_wait_time = 0;
++ulint inno_longest_lock_wait = 0;
++ulint inno_os_reads = 0;
++ulint inno_os_writes = 0;
++ulint inno_os_fsyncs = 0;
++ulint inno_ibuf_size = 0;
++ulint inno_ibuf_inserts = 0;
++ulint inno_ibuf_merged_recs = 0;
++ulint inno_ibuf_merges = 0;
++ulint inno_log_ios_done = 0;
++
+ /* structure to pass status variables to MySQL */
+ export_struc export_vars;
+
+@@ -419,6 +448,10 @@
+ const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+ time_t srv_last_monitor_time;
++
++/* last time the innodb status counters were updated through show status */
++time_t srv_last_innodb_status_time = 0;
++
+
+ mutex_t srv_innodb_monitor_mutex;
+
+@@ -677,6 +710,24 @@
+
+ ulint srv_n_threads_active[SRV_MASTER + 1];
+ ulint srv_n_threads[SRV_MASTER + 1];
++
++/*************************************************************************
++Prints the loop and log flush counters of srv_master_thread. */
+
++static
++void
++srv_print_extra(
++/*===================*/
++ FILE *file) /* in: output stream; must not be NULL */
++{
++ fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
++ "%lu 10_second, %lu background, %lu flush\n",
++ srv_main_1_second_loops, srv_main_sleeps,
++ srv_main_10_second_loops, srv_main_background_loops,
++ srv_main_flush_loops);
++ fprintf(file, "srv_master_thread log flush: %lu sync, %lu async\n",
++ srv_sync_flush, srv_async_flush);
++}
+
+ /*************************************************************************
+ Sets the info describing an i/o thread current state. */
+@@ -1685,12 +1736,13 @@
+ fputs("----------\n"
+ "BACKGROUND THREAD\n"
+ "----------\n", file);
++ srv_print_extra(file);
+ fil_print(file);
+-
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
++ fprintf(file, "Lock wait timeouts %lu\n", inno_lock_wait_timeouts);
+ sync_print(file);
+
+ /* Conceptually, srv_innodb_monitor_mutex has a very high latching
+@@ -1709,24 +1761,6 @@
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+- lock_print_info_summary(file);
+- if (trx_start) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_start = ULINT_UNDEFINED;
+- } else {
+- *trx_start = (ulint) t;
+- }
+- }
+- lock_print_info_all_transactions(file);
+- if (trx_end) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_end = ULINT_UNDEFINED;
+- } else {
+- *trx_end = (ulint) t;
+- }
+- }
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+@@ -1815,6 +1849,27 @@
+ (srv_n_rows_read - srv_n_rows_read_old)
+ / time_elapsed);
+
++ /* Print open transaction details */
++ lock_print_info_summary(file);
++
++ if (trx_start) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_start = ULINT_UNDEFINED;
++ } else {
++ *trx_start = (ulint) t;
++ }
++ }
++ lock_print_info_all_transactions(file);
++ if (trx_end) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_end = ULINT_UNDEFINED;
++ } else {
++ *trx_end = (ulint) t;
++ }
++ }
++
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+@@ -1833,7 +1888,8 @@
+ void
+ srv_export_innodb_status(void)
+ {
+-
++ long time_elapsed;
++ time_t current_time;
+ mutex_enter(&srv_innodb_monitor_mutex);
+ export_vars.innodb_data_pending_reads= os_n_pending_reads;
+ export_vars.innodb_data_pending_writes= os_n_pending_writes;
+@@ -1844,6 +1900,7 @@
+ export_vars.innodb_data_reads= os_n_file_reads;
+ export_vars.innodb_data_writes= os_n_file_writes;
+ export_vars.innodb_data_written= srv_data_written;
++ export_vars.innodb_dict_size= dict_sys->size;
+ export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets;
+ export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests;
+ export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free;
+@@ -1854,10 +1911,12 @@
+ export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU);
+ export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list);
+ export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free);
+- export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number();
++ /* This function uses too much CPU for large buffer caches. */
++ export_vars.innodb_buffer_pool_pages_latched= 1; /* buf_get_latched_pages_number(); */
+ export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size;
+ export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size -
+ UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free);
++
+ export_vars.innodb_page_size= UNIV_PAGE_SIZE;
+ export_vars.innodb_log_waits= srv_log_waits;
+ export_vars.innodb_os_log_written= srv_os_log_written;
+@@ -1885,6 +1944,103 @@
+ export_vars.innodb_rows_inserted= srv_n_rows_inserted;
+ export_vars.innodb_rows_updated= srv_n_rows_updated;
+ export_vars.innodb_rows_deleted= srv_n_rows_deleted;
++ export_vars.innodb_long_lock_wait = srv_long_lock_wait;
++ export_vars.innodb_long_lock_waits = srv_long_lock_waits;
++
++ export_vars.innodb_os_aio_read_requests = os_aio_read_requests;
++ export_vars.innodb_os_aio_write_requests = os_aio_write_requests;
++
++ export_vars.innodb_os_aio_pages_read = os_aio_pages_read;
++ export_vars.innodb_os_aio_pages_written = os_aio_pages_written;
++
++ export_vars.innodb_os_aio_read_time = os_aio_read_time;
++ export_vars.innodb_os_aio_write_time = os_aio_write_time;
++
++ if (os_aio_read_requests > 0 ) {
++ export_vars.innodb_os_aio_read_time_avg
++ = os_aio_read_time / os_aio_read_requests;
++ } else {
++ export_vars.innodb_os_aio_read_time_avg = 0;
++ }
++ if (os_aio_write_requests > 0 ) {
++ export_vars.innodb_os_aio_write_time_avg
++ = os_aio_write_time / os_aio_write_requests;
++ } else {
++ export_vars.innodb_os_aio_write_time_avg = 0;
++ }
++
++ export_vars.innodb_deadlocks = innodb_deadlocks;
++
++ // simulate srv_printf_innodb_monitor, invoked by innodb_show_status
++ // 0. direct printout inno_lock_wait_timeouts, declared in srv0srv.c
++ // total # of variable(s) updated: 1
++ export_vars.inno_lock_wait_timeouts = inno_lock_wait_timeouts;
++
++ // To prevent too frequent invocation, the *_print functions are
++ // called at most once every
++ // innobase_min_status_update_time_interval seconds.
++ current_time = time(NULL);
++ time_elapsed = difftime(current_time, srv_last_innodb_status_time);
++ if (time_elapsed >= innobase_min_status_update_time_interval) {
++ os_aio_print(NULL);
++ ibuf_print(NULL);
++ buf_print_io(NULL);
++ lock_print_info_summary(NULL);
++ lock_print_info_all_transactions(NULL);
++
++ srv_last_innodb_status_time = current_time;
++ }
++
++ // 1. os_aio_print
++ // the following were filled by calling os_aio_print
++ // total # of variable(s) updated: 8
++
++ export_vars.inno_pending_normal_aio_reads =
++ inno_pending_normal_aio_reads;
++ export_vars.inno_pending_normal_aio_writes =
++ inno_pending_normal_aio_writes;
++ export_vars.inno_pending_ibuf_aio_reads = inno_pending_ibuf_aio_reads;
++ export_vars.inno_pending_log_ios = inno_pending_log_ios;
++ export_vars.inno_pending_sync_ios = inno_pending_sync_ios;
++ export_vars.inno_os_reads = os_n_file_reads;
++ export_vars.inno_os_writes = os_n_file_writes;
++ export_vars.inno_os_fsyncs = os_n_fsyncs;
++
++ // 2. ibuf_print()
++ // total # of variable(s) updated: 4
++
++ export_vars.inno_ibuf_size = inno_ibuf_size;
++ export_vars.inno_ibuf_inserts = inno_ibuf_inserts;
++ export_vars.inno_ibuf_merged_recs = inno_ibuf_merged_recs;
++ export_vars.inno_ibuf_merges = inno_ibuf_merges;
++
++ // 3. log_print
++ // total # of variable(s) updated: 1
++ export_vars.inno_log_ios_done = (ulong) log_sys->n_log_ios;
++
++ // 5. lock_print_info_summary
++ // it enters the mutexes
++ // 1) innobase_mysql_prepare_print_arbitrary_thd()
++ // 2) lock_mutex_enter_kernel()
++ // total # of variable(s) updated: 3
++
++ export_vars.inno_transaction_count = inno_transaction_count;
++ export_vars.inno_transaction_purge_count =
++ inno_transaction_purge_count;
++ export_vars.inno_transaction_purge_lag = inno_transaction_purge_lag;
++
++ // 6. lock_print_info_all_transactions(NULL)
++ // it exits two mutexes entered from lock_print_info_summary(NULL)
++ // total # of variable(s) updated: 6
++
++ export_vars.inno_num_active_transactions = inno_num_active_transactions;
++ export_vars.inno_summed_transaction_age = inno_summed_transaction_age;
++ export_vars.inno_longest_transaction_age = inno_longest_transaction_age;
++
++ export_vars.inno_num_lock_waiters = inno_num_lock_waiters;
++ export_vars.inno_summed_lock_wait_time = inno_summed_lock_wait_time;
++ export_vars.inno_longest_lock_wait = inno_longest_lock_wait;
++
+ mutex_exit(&srv_innodb_monitor_mutex);
+
+ }
+@@ -2026,6 +2182,7 @@
+ if (thr_get_trx(slot->thr)->wait_lock) {
+ lock_cancel_waiting_and_release(
+ thr_get_trx(slot->thr)->wait_lock);
++ ++inno_lock_wait_timeouts;
+ }
+ }
+ }
+diff -r b059d02ec814 patch_info/innodb_extra_status.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_extra_status.info Mon Nov 03 05:09:34 2008 -0800
+@@ -0,0 +1,9 @@
++File=innodb_extra_status.patch
++Name=Adds additional information of InnoDB counters into SHOW STATUS
++Version=1.0
++Author=Google
++License=GPL
++Comment=
++ChangeLog=
++2008-11-03
++VT: Initial porting
+diff -r b059d02ec814 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/ha_innodb.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -299,12 +299,36 @@
+ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"dblwr_writes",
+ (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
++ {"dict_size",
++ (char*) &export_vars.innodb_dict_size, SHOW_LONG},
+ {"log_waits",
+ (char*) &export_vars.innodb_log_waits, SHOW_LONG},
+ {"log_write_requests",
+ (char*) &export_vars.innodb_log_write_requests, SHOW_LONG},
+ {"log_writes",
+ (char*) &export_vars.innodb_log_writes, SHOW_LONG},
++ {"long_lock_wait",
++ (char*) &export_vars.innodb_long_lock_wait, SHOW_BOOL},
++ {"long_lock_waits",
++ (char*) &export_vars.innodb_long_lock_waits, SHOW_LONG},
++
++ {"os_read_requests",
++ (char*) &export_vars.innodb_os_aio_read_requests, SHOW_LONG},
++ {"os_write_requests",
++ (char*) &export_vars.innodb_os_aio_write_requests, SHOW_LONG},
++ {"os_pages_read",
++ (char*) &export_vars.innodb_os_aio_pages_read, SHOW_LONG},
++ {"os_pages_written",
++ (char*) &export_vars.innodb_os_aio_pages_written, SHOW_LONG},
++ {"os_read_time",
++ (char*) &export_vars.innodb_os_aio_read_time, SHOW_LONGLONG},
++ {"os_write_time",
++ (char*) &export_vars.innodb_os_aio_write_time, SHOW_LONGLONG},
++ {"time_per_read",
++ (char*) &export_vars.innodb_os_aio_read_time_avg, SHOW_LONGLONG},
++ {"time_per_write",
++ (char*) &export_vars.innodb_os_aio_write_time_avg, SHOW_LONGLONG},
++
+ {"os_log_fsyncs",
+ (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG},
+ {"os_log_pending_fsyncs",
+@@ -339,6 +363,56 @@
+ (char*) &export_vars.innodb_rows_read, SHOW_LONG},
+ {"rows_updated",
+ (char*) &export_vars.innodb_rows_updated, SHOW_LONG},
++ {"deadlocks",
++ (char*) &export_vars.innodb_deadlocks, SHOW_LONG},
++
++ /* 24 innodb status variables exported to status */
++ {"transaction_count",
++ (char*) &export_vars.inno_transaction_count, SHOW_LONG},
++ {"transaction_purge_count",
++ (char*) &export_vars.inno_transaction_purge_count, SHOW_LONG},
++ {"transaction_purge_lag",
++ (char*) &export_vars.inno_transaction_purge_lag, SHOW_LONG},
++ {"active_transactions",
++ (char*) &export_vars.inno_num_active_transactions, SHOW_LONG},
++ {"summed_transaction_age",
++ (char*) &export_vars.inno_summed_transaction_age, SHOW_LONG},
++ {"longest_transaction_age",
++ (char*) &export_vars.inno_longest_transaction_age, SHOW_LONG},
++ {"lock_wait_timeouts",
++ (char*) &export_vars.inno_lock_wait_timeouts, SHOW_LONG},
++ {"lock_waiters",
++ (char*) &export_vars.inno_num_lock_waiters, SHOW_LONG},
++ {"summed_lock_wait_time",
++ (char*) &export_vars.inno_summed_lock_wait_time, SHOW_LONG},
++ {"longest_lock_wait",
++ (char*) &export_vars.inno_longest_lock_wait, SHOW_LONG},
++ {"pending_normal_aio_reads",
++ (char*) &export_vars.inno_pending_normal_aio_reads, SHOW_LONG},
++ {"pending_normal_aio_writes",
++ (char*) &export_vars.inno_pending_normal_aio_writes, SHOW_LONG},
++ {"pending_ibuf_aio_reads",
++ (char*) &export_vars.inno_pending_ibuf_aio_reads, SHOW_LONG},
++ {"pending_log_ios",
++ (char*) &export_vars.inno_pending_log_ios, SHOW_LONG},
++ {"pending_sync_ios",
++ (char*) &export_vars.inno_pending_sync_ios, SHOW_LONG},
++ {"os_reads",
++ (char*) &export_vars.inno_os_reads, SHOW_LONG},
++ {"os_writes",
++ (char*) &export_vars.inno_os_writes, SHOW_LONG},
++ {"os_fsyncs",
++ (char*) &export_vars.inno_os_fsyncs, SHOW_LONG},
++ {"ibuf_inserts",
++ (char*) &export_vars.inno_ibuf_inserts, SHOW_LONG},
++ {"ibuf_size",
++ (char*) &export_vars.inno_ibuf_size, SHOW_LONG},
++ {"ibuf_merged_recs",
++ (char*) &export_vars.inno_ibuf_merged_recs, SHOW_LONG},
++ {"ibuf_merges",
++ (char*) &export_vars.inno_ibuf_merges, SHOW_LONG},
++ {"log_ios_done",
++ (char*) &export_vars.inno_log_ios_done, SHOW_LONG},
+ {NullS, NullS, SHOW_LONG}};
+
+ /* General functions */
+diff -r b059d02ec814 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/ha_innodb.h Mon Nov 03 05:09:34 2008 -0800
+@@ -198,6 +198,7 @@
+ extern struct show_var_st innodb_status_variables[];
+ extern ulong innobase_fast_shutdown;
+ extern long innobase_max_merged_io;
++extern long innobase_min_status_update_time_interval;
+ extern ulong innobase_large_page_size;
+ extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
+ extern longlong innobase_buffer_pool_size, innobase_log_file_size;
+diff -r b059d02ec814 sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/mysqld.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -4950,6 +4950,7 @@
+ OPT_INNODB_SYNC_SPIN_LOOPS,
+ OPT_INNODB_CONCURRENCY_TICKETS,
+ OPT_INNODB_THREAD_SLEEP_DELAY,
++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL,
+ OPT_BDB_CACHE_SIZE,
+ OPT_BDB_LOG_BUFFER_SIZE,
+ OPT_BDB_MAX_LOCK,
+@@ -6031,6 +6032,14 @@
+ (gptr*) &srv_thread_sleep_delay,
+ (gptr*) &srv_thread_sleep_delay,
+ 0, GET_ULONG, REQUIRED_ARG, 10000L, 0L, ULONG_MAX, 0, 1L, 0},
++ {"innodb_status_update_interval",
++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL,
++ "Minimum time interval in seconds before InnoDB status counters "
++ "are updated during SHOW STATUS. "
++ "InnoDB counters are always updated during SHOW INNODB STATUS.",
++ (gptr*) &innobase_min_status_update_time_interval,
++ (gptr*) &innobase_min_status_update_time_interval,
++ 0, GET_LONG, REQUIRED_ARG, 30, 0, 3600, 0, 1, 0},
+ #endif /* HAVE_INNOBASE_DB */
+ {"interactive_timeout", OPT_INTERACTIVE_TIMEOUT,
+ "The number of seconds the server waits for activity on an interactive connection before closing it.",
+diff -r b059d02ec814 sql/set_var.cc
+--- a/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/set_var.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -948,6 +948,8 @@
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
+ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
++ {"innodb_status_update_interval",
++ (char*) &innobase_min_status_update_time_interval, SHOW_LONG},
+ #endif
+ {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
+ {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
diff --git a/percona/5.0.87-b20-20091116/innodb_io_tune.patch b/percona/5.0.87-b20-20091116/innodb_io_tune.patch
new file mode 100644
index 0000000..3953e1d
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/innodb_io_tune.patch
@@ -0,0 +1,1823 @@
+diff -r 322370200e6a innobase/include/os0file.h
+--- a/innobase/include/os0file.h Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/include/os0file.h Mon Nov 03 05:08:52 2008 -0800
+@@ -532,21 +532,16 @@
+ FALSE otherwise */
+ const char* path); /* in: path name */
+ /****************************************************************************
+-Initializes the asynchronous io system. Creates separate aio array for
+-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+-segment, two aio arrays for log reads and writes with one segment, and a
+-synchronous aio array of the specified size. The combined number of segments
+-in the three first aio arrays is the parameter n_segments given to the
+-function. The caller must create an i/o handler thread for each segment in
+-the four first arrays, but not for the sync aio array. */
++Initializes the asynchronous io system. */
+
+-void
++ulint
+ os_aio_init(
+ /*========*/
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++ /* out: number of AIO handler threads */
++ ulint ios_per_array, /* in: maximum number of pending aio operations
++ allowed per IO array */
++ ulint n_read_threads, /* in: number of read threads */
++ ulint n_write_threads, /* in: number of write threads */
+ ulint n_slots_sync); /* in: number of slots in the sync aio array */
+ /***********************************************************************
+ Requests an asynchronous i/o operation. */
+diff -r 322370200e6a innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800
+@@ -87,6 +87,14 @@
+ extern ulint srv_lock_table_size;
+
+ extern ulint srv_n_file_io_threads;
++extern ulint srv_n_read_io_threads;
++extern ulint srv_n_write_io_threads;
++
++/* Number of IO operations per second the server can do */
++extern ulint srv_io_capacity;
++
++/* Flush dirty pages when below max dirty percent */
++extern ibool srv_extra_dirty_writes;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ extern ibool srv_log_archive_on;
+@@ -252,6 +260,24 @@
+
+ /* variable to count the number of random read-aheads were done */
+ extern ulint srv_read_ahead_rnd;
++
++/* Number of read/write IO requests done, summed over all threads */
++extern ulint os_aio_read_requests;
++extern ulint os_aio_write_requests;
++
++/* Number of pages read/written, summed over all threads */
++extern ulint os_aio_pages_read;
++extern ulint os_aio_pages_written;
++
++/* Time in usecs spent performing reads/writes, summed over all threads */
++extern ib_longlong os_aio_read_time;
++extern ib_longlong os_aio_write_time;
++
++extern ulint inno_pending_normal_aio_reads;
++extern ulint inno_pending_normal_aio_writes;
++extern ulint inno_pending_ibuf_aio_reads;
++extern ulint inno_pending_log_ios;
++extern ulint inno_pending_sync_ios;
+
+ /* In this structure we store status variables to be passed to MySQL */
+ typedef struct export_var_struct export_struc;
+diff -r 322370200e6a innobase/log/log0log.c
+--- a/innobase/log/log0log.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/log/log0log.c Mon Nov 03 05:08:52 2008 -0800
+@@ -1537,6 +1537,30 @@
+
+ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
+ LOG_WRITE_FROM_BACKGROUND_SYNC);
++}
++
++/********************************************************************
++Flush the log buffer. Force it to disk depending on the value of
++innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void)
++/*==========================*/
++{
++ dulint lsn;
++
++ mutex_enter(&(log_sys->mutex));
++
++ lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
++ srv_flush_log_at_trx_commit == 1 ?
++ LOG_WRITE_FROM_BACKGROUND_SYNC :
++ LOG_WRITE_FROM_BACKGROUND_ASYNC);
+ }
+
+ /********************************************************************
+diff -r 322370200e6a innobase/os/os0file.c
+--- a/innobase/os/os0file.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/os/os0file.c Mon Nov 03 05:08:52 2008 -0800
+@@ -22,6 +22,8 @@
+ #include <errno.h>
+ #endif /* UNIV_HOTBACKUP */
+
++extern long innobase_max_merged_io;
++
+ #undef HAVE_FDATASYNC
+
+ #ifdef POSIX_ASYNC_IO
+@@ -63,6 +65,28 @@
+ ibool os_aio_use_native_aio = FALSE;
+
+ ibool os_aio_print_debug = FALSE;
++
++/* Status of an IO request in simulated AIO.
++ Protocol for simulated aio:
++ client requests IO: find slot with reserved = FALSE. Add entry with
++ status = OS_AIO_NOT_ISSUED.
++ IO thread wakes: find adjacent slots with reserved = TRUE and status =
++ OS_AIO_NOT_ISSUED. Change status for slots to
++ OS_AIO_ISSUED.
++ IO operation completes: set status for slots to OS_AIO_DONE. set status
++ for the first slot to OS_AIO_CLAIMED and return
++ result for that slot.
++ When there are multiple read and write threads, they all compete to execute
++ the requests in the array (os_aio_array_t). This avoids the need to load
++ balance requests at the time the request is made at the cost of waking all
++ threads when a request is available.
++*/
++typedef enum {
++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
++ OS_AIO_ISSUED, /* Being processed by an IO thread. */
++ OS_AIO_DONE, /* Request processed. */
++ OS_AIO_CLAIMED /* Result being returned to client. */
++} os_aio_status;
+
+ /* The aio array slot structure */
+ typedef struct os_aio_slot_struct os_aio_slot_t;
+@@ -72,6 +96,8 @@
+ ulint pos; /* index of the slot in the aio
+ array */
+ ibool reserved; /* TRUE if this slot is reserved */
++ os_aio_status status; /* Status for current request. Valid when reserved
++ is TRUE. Used only in simulated aio. */
+ time_t reservation_time;/* time when reserved */
+ ulint len; /* length of the block to read or
+ write */
+@@ -82,11 +108,6 @@
+ ulint offset_high; /* 32 high bits of file offset */
+ os_file_t file; /* file where to read or write */
+ const char* name; /* file name or path */
+- ibool io_already_done;/* used only in simulated aio:
+- TRUE if the physical i/o already
+- made and only the slot message
+- needs to be passed to the caller
+- of os_aio_simulated_handle */
+ fil_node_t* message1; /* message which is given by the */
+ void* message2; /* the requester of an aio operation
+ and which can be used to identify
+@@ -116,9 +137,6 @@
+ in this array */
+ ulint n_slots; /* Total number of slots in the aio array.
+ This must be divisible by n_threads. */
+- ulint n_segments;/* Number of segments in the aio array of
+- pending aio requests. A thread can wait
+- separately for any one of the segments. */
+ ulint n_reserved;/* Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /* Pointer to the slots in the array */
+@@ -134,6 +152,17 @@
+
+ /* Array of events used in simulated aio */
+ os_event_t* os_aio_segment_wait_events = NULL;
++
++/* Number of threads for reading and writing. */
++ulint os_aio_read_threads = 0;
++ulint os_aio_write_threads = 0;
++
++/* Number for the first global segment for reading. */
++const ulint os_aio_first_read_segment = 2;
++
++/* Number for the first global segment for writing. Set to
++os_aio_first_read_segment + os_aio_read_threads in os_aio_init(). */
++ulint os_aio_first_write_segment = 0;
+
+ /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+ are NULL when the module has not yet been initialized. */
+@@ -143,11 +172,39 @@
+ static os_aio_array_t* os_aio_log_array = NULL;
+ static os_aio_array_t* os_aio_sync_array = NULL;
+
++/* Per thread buffer used for merged IO requests. Used by
++os_aio_simulated_handle so that a buffer doesn't have to be allocated
++for each request. */
++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
++
++/* Count pages read and written per thread */
++static ulint os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS];
++
++/* Number of IO operations done. One request can be for N pages. */
++static ulint os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS];
++
++/* usecs spent blocked on an IO request */
++static double os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS];
++/* max usecs spent blocked on an IO request */
++static double os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS];
++
++/* Number of IO global segments. An IO handler thread is created for each
++global segment, except for the segment associated with os_aio_sync_array.
++Several segments can be associated with os_aio_{read,write}_array. One
++segment is created for each of the other arrays. This is also the number
++of valid entries in srv_io_thread_reads, srv_io_thread_writes,
++srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */
+ static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+-/* If the following is TRUE, read i/o handler threads try to
+-wait until a batch of new read requests have been posted */
+-static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
++/* Set to TRUE to temporarily block reads from being scheduled while a batch
++of read requests is added to allow them to be merged by the IO handler thread
++if they are adjacent. Declared volatile because we don't want this to be
++read from a register in a loop when another thread may change the value in
++memory.
++*/
++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ ulint os_n_file_reads = 0;
+ ulint os_bytes_read_since_printout = 0;
+@@ -166,6 +223,19 @@
+ ulint os_file_n_pending_pwrites = 0;
+ ulint os_n_pending_writes = 0;
+ ulint os_n_pending_reads = 0;
++
++/* TODO -- does InnoDB provide a portable method for this? */
++static double time_usecs() {
++#ifdef __WIN__
++ return 0.0;
++#else
++ struct timeval tv;
++ if (gettimeofday(&tv, NULL))
++ return 0;
++ else
++ return tv.tv_sec * 1000000.0 + tv.tv_usec;
++#endif
++}
+
+ /***************************************************************************
+ Gets the operating system version. Currently works only on Windows. */
+@@ -1351,6 +1421,8 @@
+ /* We disable OS caching (O_DIRECT) only on data files */
+ if (type != OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
++
++ fprintf(stderr, "Using O_DIRECT for file %s\n", name);
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+@@ -1798,6 +1870,32 @@
+ #endif /* __WIN__ */
+ }
+
++#ifndef __WIN__
++/***************************************************************************
++Flushes a file to disk with fsync() unless flush method is SRV_UNIX_NOSYNC. */
++
++ibool
++os_maybe_fsync(
++/*==========*/
++ /* out: 0 if success, error code otherwise */
++ os_file_t file) /* in, own: handle to a file */
++{
++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fsync(file);
++}
++
++/***************************************************************************
++Flushes a file to disk with fdatasync() unless flush method is SRV_UNIX_NOSYNC. */
++
++ibool
++os_maybe_fdatasync(
++/*==========*/
++ /* out: 0 if success, error code otherwise */
++ os_file_t file) /* in, own: handle to a file */
++{
++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fdatasync(file);
++}
++#endif
++
+ /***************************************************************************
+ Flushes the write buffers of a given file to the disk. */
+
+@@ -1855,21 +1953,21 @@
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ }
+ }
+ #elif HAVE_FDATASYNC
+- ret = fdatasync(file);
++ ret = os_maybe_fdatasync(file);
+ #else
+ /* fprintf(stderr, "Flushing to file %p\n", file); */
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ #endif
+ os_n_fsyncs++;
+
+@@ -2298,6 +2396,9 @@
+
+ return(TRUE);
+ }
++ fprintf(stderr,
++"InnoDB: error: os_file_pread wanted %lu and got %lu.\n",
++ (ulint) n, (ulint) ret);
+ #endif
+ #ifdef __WIN__
+ error_handling:
+@@ -2784,9 +2885,8 @@
+ os_aio_array_create(
+ /*================*/
+ /* out, own: aio array */
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments) /* in: number of segments in the aio array */
++ ulint n) /* in: maximum number of pending aio operations
++ allowed */
+ {
+ os_aio_array_t* array;
+ ulint i;
+@@ -2795,7 +2895,6 @@
+ OVERLAPPED* over;
+ #endif
+ ut_a(n > 0);
+- ut_a(n_segments > 0);
+
+ array = ut_malloc(sizeof(os_aio_array_t));
+
+@@ -2806,7 +2905,6 @@
+ os_event_set(array->is_empty);
+
+ array->n_slots = n;
+- array->n_segments = n_segments;
+ array->n_reserved = 0;
+ array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
+ #ifdef __WIN__
+@@ -2833,70 +2931,75 @@
+
+ /****************************************************************************
+ Initializes the asynchronous io system. Calls also os_io_init_simple.
+-Creates a separate aio array for
+-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+-segment, two aio arrays for log reads and writes with one segment, and a
+-synchronous aio array of the specified size. The combined number of segments
+-in the three first aio arrays is the parameter n_segments given to the
+-function. The caller must create an i/o handler thread for each segment in
+-the four first arrays, but not for the sync aio array. */
+-
+-void
++Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO,
++log IO, and synchronous IO. The caller must create i/o handler thread for all
++but the synchronous aio array. Multiple threads can access the same array for
++the non-ibuf read (prefetch) and write (flush dirty buffer pages) arrays.
++Return the number of AIO handler threads. */
++
++ulint
+ os_aio_init(
+ /*========*/
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++ ulint ios_per_array, /* in: maximum number of pending aio operations
++ allowed per array */
++ ulint n_read_threads, /* in: number of read threads */
++ ulint n_write_threads, /* in: number of write threads */
+ ulint n_slots_sync) /* in: number of slots in the sync aio array */
+ {
+- ulint n_read_segs;
+- ulint n_write_segs;
+- ulint n_per_seg;
+- ulint i;
++ ulint i;
++ ulint n_segments = 2 + n_read_threads + n_write_threads;
+ #ifdef POSIX_ASYNC_IO
+ sigset_t sigset;
+ #endif
+- ut_ad(n % n_segments == 0);
+- ut_ad(n_segments >= 4);
++ ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD);
++ ut_a(n_read_threads >= 1 && n_read_threads <= 64);
++ ut_a(n_write_threads >= 1 && n_write_threads <= 64);
++ ut_a(n_segments < SRV_MAX_N_IO_THREADS);
+
+ os_io_init_simple();
+
+ for (i = 0; i < n_segments; i++) {
+ srv_set_io_thread_op_info(i, "not started yet");
+- }
+-
+- n_per_seg = n / n_segments;
+- n_write_segs = (n_segments - 2) / 2;
+- n_read_segs = n_segments - 2 - n_write_segs;
+-
+- /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+-
+- os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
++ os_aio_thread_io_reads[i] = 0;
++ os_aio_thread_io_writes[i] = 0;
++ os_aio_thread_io_requests[i] = 0;
++ os_aio_thread_buffer[i] = 0;
++ os_aio_thread_buffer_size[i] = 0;
++ os_aio_thread_io_wait[i] = 0;
++ os_aio_thread_max_io_wait[i] = 0;
++ }
++
++ os_aio_read_threads = n_read_threads;
++ os_aio_write_threads = n_write_threads;
++ os_aio_first_write_segment = os_aio_first_read_segment + os_aio_read_threads;
++
++ fprintf(stderr,
++ "InnoDB: ios_per_array %lu read threads %lu write threads %lu\n",
++ ios_per_array, os_aio_read_threads, os_aio_write_threads);
++
++ os_aio_ibuf_array = os_aio_array_create(ios_per_array);
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+- os_aio_log_array = os_aio_array_create(n_per_seg, 1);
++ os_aio_log_array = os_aio_array_create(ios_per_array);
+
+ srv_io_thread_function[1] = "log thread";
+
+- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
+- n_read_segs);
+- for (i = 2; i < 2 + n_read_segs; i++) {
++ os_aio_read_array = os_aio_array_create(ios_per_array);
++ for (i = os_aio_first_read_segment; i < os_aio_first_write_segment; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+- srv_io_thread_function[i] = "read thread";
+- }
+-
+- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
+- n_write_segs);
+- for (i = 2 + n_read_segs; i < n_segments; i++) {
++ srv_io_thread_function[i] = "read thread";
++ }
++
++ os_aio_write_array = os_aio_array_create(ios_per_array);
++ for (i = os_aio_first_write_segment; i < n_segments; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+- srv_io_thread_function[i] = "write thread";
+- }
+-
+- os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+-
+- os_aio_n_segments = n_segments;
++ srv_io_thread_function[i] = "write thread";
++ }
++
++ os_aio_sync_array = os_aio_array_create(n_slots_sync);
++
++ os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads;
+
+ os_aio_validate();
+
+@@ -2924,6 +3027,7 @@
+
+ pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
+ #endif
++ return os_aio_n_segments;
+ }
+
+ #ifdef WIN_ASYNC_IO
+@@ -2981,77 +3085,32 @@
+ os_event_wait(os_aio_write_array->is_empty);
+ }
+
+-/**************************************************************************
+-Calculates segment number for a slot. */
+-static
+-ulint
+-os_aio_get_segment_no_from_slot(
+-/*============================*/
+- /* out: segment number (which is the number
+- used by, for example, i/o-handler threads) */
+- os_aio_array_t* array, /* in: aio wait array */
+- os_aio_slot_t* slot) /* in: slot in this array */
+-{
+- ulint segment;
+- ulint seg_len;
+-
+- if (array == os_aio_ibuf_array) {
+- segment = 0;
+-
+- } else if (array == os_aio_log_array) {
+- segment = 1;
+-
+- } else if (array == os_aio_read_array) {
+- seg_len = os_aio_read_array->n_slots /
+- os_aio_read_array->n_segments;
+-
+- segment = 2 + slot->pos / seg_len;
+- } else {
+- ut_a(array == os_aio_write_array);
+- seg_len = os_aio_write_array->n_slots /
+- os_aio_write_array->n_segments;
+-
+- segment = os_aio_read_array->n_segments + 2
+- + slot->pos / seg_len;
+- }
+-
+- return(segment);
+-}
+-
+-/**************************************************************************
+-Calculates local segment number and aio array from global segment number. */
+-static
+-ulint
+-os_aio_get_array_and_local_segment(
++
++/**************************************************************************
++Calculates aio array from global segment number. */
++static
++os_aio_array_t*
++os_aio_get_array(
+ /*===============================*/
+- /* out: local segment number within
+- the aio array */
+- os_aio_array_t** array, /* out: aio wait array */
++ /* out: aio wait array */
+ ulint global_segment)/* in: global segment number */
+ {
+- ulint segment;
+
+ ut_a(global_segment < os_aio_n_segments);
+
+ if (global_segment == 0) {
+- *array = os_aio_ibuf_array;
+- segment = 0;
++ return os_aio_ibuf_array;
+
+ } else if (global_segment == 1) {
+- *array = os_aio_log_array;
+- segment = 0;
+-
+- } else if (global_segment < os_aio_read_array->n_segments + 2) {
+- *array = os_aio_read_array;
+-
+- segment = global_segment - 2;
+- } else {
+- *array = os_aio_write_array;
+-
+- segment = global_segment - (os_aio_read_array->n_segments + 2);
+- }
+-
+- return(segment);
++ return os_aio_log_array;
++
++ } else if (global_segment < os_aio_first_write_segment) {
++ return os_aio_read_array;
++
++ } else {
++ return os_aio_write_array;
++
++ }
+ }
+
+ /***********************************************************************
+@@ -3160,7 +3219,7 @@
+
+ os_aio_simulated_wake_handler_threads();
+ }
+-
++
+ os_event_wait(array->not_full);
+
+ goto loop;
+@@ -3173,7 +3232,7 @@
+ break;
+ }
+ }
+-
++ ut_a(i < array->n_slots);
+ array->n_reserved++;
+
+ if (array->n_reserved == 1) {
+@@ -3195,7 +3254,7 @@
+ slot->buf = buf;
+ slot->offset = offset;
+ slot->offset_high = offset_high;
+- slot->io_already_done = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ #ifdef WIN_ASYNC_IO
+ control = &(slot->control);
+@@ -3246,8 +3305,9 @@
+ os_mutex_enter(array->mutex);
+
+ ut_ad(slot->reserved);
+-
++
+ slot->reserved = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ array->n_reserved--;
+
+@@ -3266,36 +3326,40 @@
+ }
+
+ /**************************************************************************
+-Wakes up a simulated aio i/o-handler thread if it has something to do. */
++Wake up the simulated aio i/o-handler threads for a given array if there
++is work to do. */
+ static
+ void
+ os_aio_simulated_wake_handler_thread(
+ /*=================================*/
+- ulint global_segment) /* in: the number of the segment in the aio
+- arrays */
+-{
+- os_aio_array_t* array;
+- os_aio_slot_t* slot;
+- ulint segment;
++ os_aio_array_t* array) /* in: aio array for which wakeup is done */
++{
++ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+
+ ut_ad(!os_aio_use_native_aio);
+
+- segment = os_aio_get_array_and_local_segment(&array, global_segment);
+-
+- n = array->n_slots / array->n_segments;
+-
+- /* Look through n slots after the segment * n'th slot */
+-
+- os_mutex_enter(array->mutex);
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved) {
+- /* Found an i/o request */
+-
++ n = array->n_slots;
++
++ /* Look through n slots */
++
++ os_mutex_enter(array->mutex);
++
++ for (i = 0; i < n; i++) {
++ slot = os_aio_array_get_nth_slot(array, i );
++
++ if (slot->reserved &&
++ (slot->status == OS_AIO_NOT_ISSUED ||
++ slot->status == OS_AIO_DONE)) {
++ /* Found an i/o request.
++ * OS_AIO_NOT_ISSUED means the read or write request has
++ * yet to be done. OS_AIO_DONE means the request has been
++ * done but it was part of a set of requests merged into
++ * one read or write call and was not the first block in
++ * the request, so the handling of the IO completion for
++ * that block has not been done. */
++
+ break;
+ }
+ }
+@@ -3303,7 +3367,25 @@
+ os_mutex_exit(array->mutex);
+
+ if (i < n) {
+- os_event_set(os_aio_segment_wait_events[global_segment]);
++ if (array == os_aio_ibuf_array) {
++ os_event_set(os_aio_segment_wait_events[0]);
++
++ } else if (array == os_aio_log_array) {
++ os_event_set(os_aio_segment_wait_events[1]);
++
++ } else if (array == os_aio_read_array) {
++ ulint x;
++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else if (array == os_aio_write_array) {
++ ulint x;
++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else {
++ ut_a(0);
++ }
+ }
+ }
+
+@@ -3320,13 +3402,14 @@
+ /* We do not use simulated aio: do nothing */
+
+ return;
+- }
+-
+- os_aio_recommend_sleep_for_read_threads = FALSE;
+-
+- for (i = 0; i < os_aio_n_segments; i++) {
+- os_aio_simulated_wake_handler_thread(i);
+- }
++ }
++
++ os_aio_recommend_sleep_for_read_threads = FALSE;
++
++ os_aio_simulated_wake_handler_thread(os_aio_ibuf_array);
++ os_aio_simulated_wake_handler_thread(os_aio_log_array);
++ os_aio_simulated_wake_handler_thread(os_aio_read_array);
++ os_aio_simulated_wake_handler_thread(os_aio_write_array);
+ }
+
+ /**************************************************************************
+@@ -3339,18 +3422,13 @@
+ os_aio_simulated_put_read_threads_to_sleep(void)
+ /*============================================*/
+ {
+- os_aio_array_t* array;
+ ulint g;
+
++ /* TODO(mcallaghan): provide similar function for write? */
+ os_aio_recommend_sleep_for_read_threads = TRUE;
+
+- for (g = 0; g < os_aio_n_segments; g++) {
+- os_aio_get_array_and_local_segment(&array, g);
+-
+- if (array == os_aio_read_array) {
+-
+- os_event_reset(os_aio_segment_wait_events[g]);
+- }
++ for (g = os_aio_first_read_segment; g < os_aio_first_write_segment; g++) {
++ os_event_reset(os_aio_segment_wait_events[g]);
+ }
+ }
+
+@@ -3480,8 +3558,7 @@
+ #endif
+ } else {
+ if (!wake_later) {
+- os_aio_simulated_wake_handler_thread(
+- os_aio_get_segment_no_from_slot(array, slot));
++ os_aio_simulated_wake_handler_thread(array);
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+@@ -3497,8 +3574,7 @@
+ #endif
+ } else {
+ if (!wake_later) {
+- os_aio_simulated_wake_handler_thread(
+- os_aio_get_segment_no_from_slot(array, slot));
++ os_aio_simulated_wake_handler_thread(array);
+ }
+ }
+ } else {
+@@ -3561,7 +3637,7 @@
+ os_aio_windows_handle(
+ /*==================*/
+ /* out: TRUE if the aio operation succeeded */
+- ulint segment, /* in: the number of the segment in the aio
++ ulint global_segment, /* in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+@@ -3579,7 +3655,6 @@
+ void** message2,
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
+ {
+- ulint orig_seg = segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+@@ -3588,33 +3663,30 @@
+ BOOL ret;
+ DWORD len;
+
+- if (segment == ULINT_UNDEFINED) {
++ if (global_segment == ULINT_UNDEFINED) {
+ array = os_aio_sync_array;
+- segment = 0;
+- } else {
+- segment = os_aio_get_array_and_local_segment(&array, segment);
++ } else {
++ array = os_aio_get_array(global_segment);
+ }
+
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ ut_ad(os_aio_validate());
+- ut_ad(segment < array->n_segments);
+-
+- n = array->n_slots / array->n_segments;
++
++ n = array->n_slots;
+
+ if (array == os_aio_sync_array) {
+ os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+ i = pos;
+ } else {
+- srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+- i = os_event_wait_multiple(n,
+- (array->native_events) + segment * n);
+- }
+-
+- os_mutex_enter(array->mutex);
+-
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ srv_set_io_thread_op_info(global_segment, "wait Windows aio");
++ i = os_event_wait_multiple(n, (array->native_events));
++ }
++
++ os_mutex_enter(array->mutex);
++
++ slot = os_aio_array_get_nth_slot(array, i);
+
+ ut_a(slot->reserved);
+
+@@ -3787,14 +3859,16 @@
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
+ {
+ os_aio_array_t* array;
+- ulint segment;
+ os_aio_slot_t* slot;
+ os_aio_slot_t* slot2;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
++ os_aio_slot_t* lowest_request;
++ os_aio_slot_t* oldest_request;
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ ulint lowest_offset;
++ ulint oldest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+@@ -3802,8 +3876,10 @@
+ ibool ret;
+ ulint n;
+ ulint i;
+-
+- segment = os_aio_get_array_and_local_segment(&array, global_segment);
++
++ double start_usecs, stop_usecs, elapsed_usecs;
++ time_t now;
++ array = os_aio_get_array(global_segment);
+
+ restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+@@ -3812,11 +3888,10 @@
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate());
+- ut_ad(segment < array->n_segments);
+-
+- n = array->n_slots / array->n_segments;
+-
+- /* Look through n slots after the segment * n'th slot */
++
++ n = array->n_slots;
++
++ /* Look through n slots */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+@@ -3836,9 +3911,9 @@
+ done */
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved && slot->io_already_done) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+@@ -3846,79 +3921,66 @@
+ }
+
+ ret = TRUE;
+-
++
+ goto slot_io_done;
+ }
+ }
+
+- n_consecutive = 0;
+-
+- /* If there are at least 2 seconds old requests, then pick the oldest
+- one to prevent starvation. If several requests have the same age,
+- then pick the one at the lowest offset. */
+-
+ biggest_age = 0;
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved) {
+- age = (ulint)difftime(time(NULL),
+- slot->reservation_time);
+-
++ now = time(NULL);
++ oldest_request = lowest_request = NULL;
++ oldest_offset = lowest_offset = ULINT_MAX;
++
++ /* Find the oldest request and the request with the smallest offset */
++ for (i = 0; i < n; i++) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
++ age = (ulint)difftime(now, slot->reservation_time);
++
++ /* If there are at least 2 seconds old requests, then pick the oldest
++ one to prevent starvation. If several requests have the same age,
++ then pick the one at the lowest offset. */
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+- && slot->offset < lowest_offset)) {
++ && slot->offset < oldest_offset)) {
+
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+ biggest_age = age;
++ oldest_request = slot;
++ oldest_offset = slot->offset;
++ }
++
++ /* Look for an i/o request at the lowest offset in the array
++ * (we ignore the high 32 bits of the offset) */
++ if (slot->offset < lowest_offset) {
++ /* Found an i/o request */
++ lowest_request = slot;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+- if (n_consecutive == 0) {
+- /* There were no old requests. Look for an i/o request at the
+- lowest offset in the array (we ignore the high 32 bits of the
+- offset in these heuristics) */
+-
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array,
+- i + segment * n);
+-
+- if (slot->reserved && slot->offset < lowest_offset) {
+-
+- /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+- lowest_offset = slot->offset;
+- }
+- }
+- }
+-
+- if (n_consecutive == 0) {
++ if (!lowest_request && !oldest_request) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+- slot = consecutive_ios[0];
++ if (oldest_request) {
++ slot = oldest_request;
++ } else {
++ slot = lowest_request;
++ }
++ consecutive_ios[0] = slot;
++ n_consecutive = 1;
+
+ /* Check if there are several consecutive blocks to read or write */
+
+ consecutive_loop:
+ for (i = 0; i < n; i++) {
+- slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot2 = os_aio_array_get_nth_slot(array, i);
+
+ if (slot2->reserved && slot2 != slot
+ && slot2->offset == slot->offset + slot->len
+@@ -3926,7 +3988,8 @@
+ sum does not wrap over */
+ && slot2->offset_high == slot->offset_high
+ && slot2->type == slot->type
+- && slot2->file == slot->file) {
++ && slot2->file == slot->file
++ && slot2->status == OS_AIO_NOT_ISSUED) {
+
+ /* Found a consecutive i/o request */
+
+@@ -3935,7 +3998,8 @@
+
+ slot = slot2;
+
+- if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
++ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE
++ && n_consecutive < innobase_max_merged_io) {
+
+ goto consecutive_loop;
+ } else {
+@@ -3955,6 +4019,8 @@
+
+ for (i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_ISSUED;
+ }
+
+ if (n_consecutive == 1) {
+@@ -3962,7 +4028,16 @@
+ combined_buf = slot->buf;
+ combined_buf2 = NULL;
+ } else {
+- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
++
++ if (os_aio_thread_buffer[global_segment])
++ ut_free(os_aio_thread_buffer[global_segment]);
++
++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
++
++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
++ }
++ combined_buf2 = os_aio_thread_buffer[global_segment];
+
+ ut_a(combined_buf2);
+
+@@ -3973,6 +4048,9 @@
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
++ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_ISSUED);
++
+ os_mutex_exit(array->mutex);
+
+ if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+@@ -3998,6 +4076,7 @@
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (slot->type == OS_FILE_WRITE) {
++ os_aio_thread_io_writes[global_segment] += n_consecutive;
+ if (array == os_aio_write_array) {
+ if ((total_len % UNIV_PAGE_SIZE != 0)
+ || (slot->offset % UNIV_PAGE_SIZE != 0)) {
+@@ -4012,16 +4091,34 @@
+ os_file_check_page_trailers(combined_buf, total_len);
+ }
+
++ start_usecs = time_usecs();
+ ret = os_file_write(slot->name, slot->file, combined_buf,
+ slot->offset, slot->offset_high, total_len);
+-
++ stop_usecs = time_usecs();
++ elapsed_usecs = stop_usecs - start_usecs;
++ if (elapsed_usecs < 0) elapsed_usecs = 0;
+ if (array == os_aio_write_array) {
+ os_file_check_page_trailers(combined_buf, total_len);
+ }
+- } else {
++ os_aio_write_requests++;
++ os_aio_pages_written += n_consecutive;
++ os_aio_write_time += (ib_longlong)elapsed_usecs;
++ } else {
++ start_usecs = time_usecs();
++ os_aio_thread_io_reads[global_segment] += n_consecutive;
+ ret = os_file_read(slot->file, combined_buf,
+ slot->offset, slot->offset_high, total_len);
+- }
++ stop_usecs = time_usecs();
++ elapsed_usecs = stop_usecs - start_usecs;
++ if (elapsed_usecs < 0) elapsed_usecs = 0;
++ os_aio_read_requests++;
++ os_aio_pages_read += n_consecutive;
++ os_aio_read_time += (ib_longlong)elapsed_usecs;
++ }
++ if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment])
++ os_aio_thread_max_io_wait[global_segment] = elapsed_usecs;
++ os_aio_thread_io_wait[global_segment] += elapsed_usecs;
++ os_aio_thread_io_requests[global_segment]++;
+
+ ut_a(ret);
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+@@ -4042,16 +4139,13 @@
+ }
+ }
+
+- if (combined_buf2) {
+- ut_free(combined_buf2);
+- }
+-
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (i = 0; i < n_consecutive; i++) {
+- consecutive_ios[i]->io_already_done = TRUE;
++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_DONE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+@@ -4061,6 +4155,8 @@
+ slot_io_done:
+
+ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_DONE);
++ slot->status = OS_AIO_CLAIMED;
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+@@ -4070,7 +4166,8 @@
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+-
++ srv_set_io_thread_op_info(global_segment, "exited handler");
++
+ return(ret);
+
+ wait_for_io:
+@@ -4115,7 +4212,6 @@
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+- ut_a(array->n_segments > 0);
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+@@ -4165,11 +4261,20 @@
+ double time_elapsed;
+ double avg_bytes_read;
+ ulint i;
+-
+- for (i = 0; i < srv_n_file_io_threads; i++) {
+- fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
+- srv_io_thread_op_info[i],
+- srv_io_thread_function[i]);
++ ulint num_issued, num_done, num_claimed;
++
++ if (file) {
++ for (i = 0; i < os_aio_n_segments; i++) {
++ fprintf(file,
++ "I/O thread %lu state: %s (%s) reads %lu writes %lu "
++ "requests %lu io secs %lf io msecs/request %lf max_io_wait %lf",
++ i, srv_io_thread_op_info[i], srv_io_thread_function[i],
++ os_aio_thread_io_reads[i], os_aio_thread_io_writes[i],
++ os_aio_thread_io_requests[i],
++ os_aio_thread_io_wait[i] / 1000000.0,
++ os_aio_thread_io_requests[i] ?
++ os_aio_thread_io_wait[i] / os_aio_thread_io_requests[i] / 1000.0 : 0.0,
++ os_aio_thread_max_io_wait[i] / 1000.0);
+
+ #ifndef __WIN__
+ if (os_aio_segment_wait_events[i]->is_set) {
+@@ -4181,6 +4286,7 @@
+ }
+
+ fputs("Pending normal aio reads:", file);
++ } /* if (file) */
+
+ array = os_aio_read_array;
+ loop:
+@@ -4189,14 +4295,23 @@
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+- ut_a(array->n_segments > 0);
+
+ n_reserved = 0;
++ num_done = num_issued = num_claimed = 0;
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved) {
++ if (slot->status == OS_AIO_ISSUED)
++ num_issued++;
++ else if (slot->status == OS_AIO_DONE)
++ num_done++;
++ else if (slot->status == OS_AIO_CLAIMED)
++ num_claimed++;
++ else /* reserved, not yet issued by a handler: not part of the tally */
++ ut_ad(slot->status == OS_AIO_NOT_ISSUED);
++
+ n_reserved++;
+ /* fprintf(stderr, "Reserved slot, messages %p %p\n",
+ slot->message1, slot->message2); */
+@@ -4206,42 +4321,56 @@
+
+ ut_a(array->n_reserved == n_reserved);
+
+- fprintf(file, " %lu", (ulong) n_reserved);
+-
++ if (file) fprintf(file, " %lu", (ulong) n_reserved);
++
+ os_mutex_exit(array->mutex);
+
+ if (array == os_aio_read_array) {
+- fputs(", aio writes:", file);
+-
++ inno_pending_normal_aio_reads = (ulong) n_reserved;
++ if (file) fputs(", aio writes:", file);
+ array = os_aio_write_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_write_array) {
+- fputs(",\n ibuf aio reads:", file);
++ inno_pending_normal_aio_writes = (ulong) n_reserved;
++ if (file) fputs(",\n ibuf aio reads:", file);
+ array = os_aio_ibuf_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_ibuf_array) {
+- fputs(", log i/o's:", file);
++ inno_pending_ibuf_aio_reads = (ulong) n_reserved;
++ if (file) fputs(", log i/o's:", file);
+ array = os_aio_log_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_log_array) {
+- fputs(", sync i/o's:", file);
++ inno_pending_log_ios = (ulong) n_reserved;
++ if (file) fputs(", sync i/o's:", file);
+ array = os_aio_sync_array;
+
+ goto loop;
+ }
+
+- putc('\n', file);
++ if (array == os_aio_sync_array) {
++ inno_pending_sync_ios = (ulong) n_reserved;
++ }
++
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
++
++ if (file) {
++ putc('\n', file);
++ fprintf(file,
++ "Summary of background IO slot status: %lu issued, "
++ "%lu done, %lu claimed, sleep set %d\n",
++ num_issued, num_done, num_claimed,
++ os_aio_recommend_sleep_for_read_threads);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
+@@ -4274,6 +4403,7 @@
+ / time_elapsed,
+ (os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
++ } /* if (file) */
+
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+diff -r 322370200e6a innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800
+@@ -164,7 +164,17 @@
+ ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */
+ ulint srv_lock_table_size = ULINT_MAX;
+
++ulint srv_io_capacity = ULINT_MAX; /* Number of IO operations per
++ second the server can do */
++
++ibool srv_extra_dirty_writes = TRUE; /* Write dirty pages to disk when pct
++ dirty < max dirty pct */
++
++/* Deprecated by srv_n_{read,write}_io_threads */
+ ulint srv_n_file_io_threads = ULINT_MAX;
++/* Number of background IO threads for read and write requests */
++ulint srv_n_read_io_threads = ULINT_MAX;
++ulint srv_n_write_io_threads = ULINT_MAX;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ ibool srv_log_archive_on = FALSE;
+@@ -238,6 +248,24 @@
+
+ /* variable to count the number of random read-aheads */
+ ulint srv_read_ahead_rnd = 0;
++
++/* Read/write IO requests completed, summed over all IO threads. NOTE(review): appears to be updated outside array->mutex - confirm. */
++ulint os_aio_read_requests = 0;
++ulint os_aio_write_requests = 0;
++
++/* Pages read/written by those requests (one per merged slot), summed over all IO threads */
++ulint os_aio_pages_read = 0;
++ulint os_aio_pages_written = 0;
++
++/* Time in microseconds spent performing reads/writes, summed over all IO threads */
++ib_longlong os_aio_read_time = 0;
++ib_longlong os_aio_write_time = 0;
++
++ulint inno_pending_normal_aio_reads = 0;
++ulint inno_pending_normal_aio_writes = 0;
++ulint inno_pending_ibuf_aio_reads = 0;
++ulint inno_pending_log_ios = 0;
++ulint inno_pending_sync_ios = 0;
+
+ /* structure to pass status variables to MySQL */
+ export_struc export_vars;
+@@ -413,6 +441,23 @@
+
+ ulint srv_main_thread_process_no = 0;
+ ulint srv_main_thread_id = 0;
++
++/* The following count work done by srv_master_thread. */
++
++/* Iterations by the 'once per second' loop. NOTE(review): no increment is visible in this patch - confirm. */
++ulint srv_main_1_second_loops = 0;
++/* Calls to sleep by the 'once per second' loop. NOTE(review): no increment is visible in this patch - confirm. */
++ulint srv_main_sleeps = 0;
++/* Iterations by the 'once per 10 seconds' loop. */
++ulint srv_main_10_second_loops = 0;
++/* Iterations of the loop bounded by the 'background_loop' label. */
++ulint srv_main_background_loops = 0;
++/* Iterations of the loop bounded by the 'flush_loop' label. */
++ulint srv_main_flush_loops = 0;
++/* Calls to log_buffer_flush_to_disk. */
++ulint srv_sync_flush = 0;
++/* Calls to log_buffer_flush_maybe_sync. */
++ulint srv_async_flush = 0;
+
+ /*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+@@ -2170,7 +2215,12 @@
+ }
+
+ /*************************************************************************
+-The master thread controlling the server. */
++Returns the number of IO operations that is X percent of the capacity.
++
++PCT_IO(5) -> returns the number of IO operations that is 5% of the max
++where max is srv_io_capacity.
++*/
++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) (pct) / 100.0))) /* (pct) parenthesized so expression arguments scale as a whole */
+
+ #ifndef __WIN__
+ void*
+@@ -2199,11 +2249,15 @@
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
++
+
+ #ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+ #endif
++ fprintf(stderr, "InnoDB master thread running with io_capacity %lu\n",
++ (ulong) srv_io_capacity); /* ulint is cast to match %lu */
++
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+@@ -2275,26 +2329,28 @@
+
+ srv_main_thread_op_info = "flushing log";
+ log_buffer_flush_to_disk();
++ srv_sync_flush++;
+
+ srv_main_thread_op_info = "making checkpoint";
+ log_free_check();
+
+- /* If there were less than 5 i/os during the
+- one second sleep, we assume that there is free
+- disk i/o capacity available, and it makes sense to
+- do an insert buffer merge. */
++ /* If i/os during one second sleep were less than 5% of
++ capacity, we assume that there is free disk i/o capacity
++ available, and it makes sense to do an insert buffer merge. */
+
+ n_pend_ios = buf_get_n_pending_ios()
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+ }
+
+ if (buf_get_modified_ratio_pct() >
+@@ -2303,7 +2359,8 @@
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
++ PCT_IO(100),
+ ut_dulint_max);
+
+ /* If we had to do the flush, it may have taken
+@@ -2325,36 +2382,47 @@
+
+ /* ---- We perform the following code approximately once per
+ 10 seconds when there is database activity */
++ srv_main_10_second_loops++;
+
+ #ifdef MEM_PERIODIC_CHECK
+ /* Check magic numbers of every allocated mem block once in 10
+ seconds */
+ mem_validate_all_blocks();
+ #endif
+- /* If there were less than 200 i/os during the 10 second period,
+- we assume that there is free disk i/o capacity available, and it
+- makes sense to flush 100 pages. */
++ /* If i/os during the 10 second period were less than 200% of
++ capacity, we assume that there is free disk i/o capacity
++ available, and it makes sense to flush srv_io_capacity pages.
++
++ Note that this is done regardless of the fraction of dirty
++ pages relative to the max requested by the user. The one second
++ loop above requests writes for that case. The writes done here
++ are not required, and may be disabled. */
+
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
++ if (srv_extra_dirty_writes &&
++ n_pend_ios < PCT_IO(3) && (n_ios - n_ios_very_old < PCT_IO(200))) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server were active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+
+ /* We run a full purge every 10 seconds, even if the server
+ were active */
+@@ -2378,8 +2446,9 @@
+ if (difftime(current_time, last_flush_time) > 1) {
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
++ srv_sync_flush++;
+ }
+ }
+
+@@ -2393,14 +2462,14 @@
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ ut_dulint_max);
+ }
+
+@@ -2434,7 +2503,7 @@
+
+ /* The server has been quiet for a while: start running background
+ operations */
+-
++ srv_main_background_loops++;
+ srv_main_thread_op_info = "doing background drop tables";
+
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+@@ -2472,6 +2541,7 @@
+
+ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
++ srv_sync_flush++;
+ }
+ }
+
+@@ -2487,9 +2557,13 @@
+ srv_main_thread_op_info = "doing insert buffer merge";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+- n_bytes_merged = 0;
++ n_bytes_merged = 0;
+ } else {
+- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
++ /* This should do an amount of IO similar to the number of
++ * dirty pages that will be flushed in the call to
++ * buf_flush_batch below. Otherwise, the system favors
++ * clean pages over cleanup throughput. */
++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100));
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+@@ -2503,10 +2577,11 @@
+
+ flush_loop:
+ srv_main_thread_op_info = "flushing buffer pool pages";
++ srv_main_flush_loops++;
+
+ if (srv_fast_shutdown < 2) {
+ n_pages_flushed =
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ } else {
+ /* In the fastest shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+@@ -2528,7 +2603,17 @@
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ current_time = time(NULL);
++ if (difftime(current_time, last_flush_time) > 1) {
++ srv_main_thread_op_info = "flushing log";
++ log_buffer_flush_to_disk();
++ last_flush_time = current_time;
++ srv_sync_flush++;
++ } else {
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
++ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+diff -r 322370200e6a innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/srv/srv0start.c Mon Nov 03 05:08:52 2008 -0800
+@@ -973,6 +973,7 @@
+ ulint i;
+ ibool srv_file_per_table_original_value = srv_file_per_table;
+ mtr_t mtr;
++ ulint n_threads;
+ #ifdef HAVE_DARWIN_THREADS
+ # ifdef F_FULLFSYNC
+ /* This executable has been compiled on Mac OS X 10.3 or later.
+@@ -1206,24 +1207,32 @@
+ }
+
+ /* Restrict the maximum number of file i/o threads */
+- if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+-
+- srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
++ if ((srv_n_read_io_threads + srv_n_write_io_threads) > SRV_MAX_N_IO_THREADS) {
++ fprintf(stderr, /* thread counts are ulint: print with %lu via (ulong) */
++ "InnoDB: requested too many read(%lu) or write(%lu) IO threads, max is %lu\n",
++ (ulong) srv_n_read_io_threads, (ulong) srv_n_write_io_threads, (ulong) SRV_MAX_N_IO_THREADS);
++ return(DB_ERROR);
+ }
+
+ if (!os_aio_use_native_aio) {
+- /* In simulated aio we currently have use only for 4 threads */
+- srv_n_file_io_threads = 4;
++ /* More than 4 threads are now supported. */
++ n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
++ srv_n_read_io_threads,
++ srv_n_write_io_threads,
++ SRV_MAX_N_PENDING_SYNC_IOS);
++ } else {
++ /* Might need more slots here. Alas, I don't do windows. */
++ n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD,
++ srv_n_read_io_threads,
++ srv_n_write_io_threads,
++ SRV_MAX_N_PENDING_SYNC_IOS);
++ }
+
+- os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+- * srv_n_file_io_threads,
+- srv_n_file_io_threads,
+- SRV_MAX_N_PENDING_SYNC_IOS);
+- } else {
+- os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+- * srv_n_file_io_threads,
+- srv_n_file_io_threads,
+- SRV_MAX_N_PENDING_SYNC_IOS);
++ if (n_threads > SRV_MAX_N_IO_THREADS) {
++ fprintf(stderr, /* n_threads is ulint: print with %lu via (ulong) */
++ "InnoDB: requested too many IO threads(%lu), max is %lu\n",
++ (ulong) n_threads, (ulong) SRV_MAX_N_IO_THREADS);
++ return(DB_ERROR);
+ }
+
+ fil_init(srv_max_n_open_files);
+@@ -1259,11 +1268,11 @@
+
+ /* Create i/o-handler threads: */
+
+- for (i = 0; i < srv_n_file_io_threads; i++) {
++ for (i = 0; i < n_threads; i++) {
+ n[i] = i;
+
+ os_thread_create(io_handler_thread, n + i, thread_ids + i);
+- }
++ }
+
+ #ifdef UNIV_LOG_ARCHIVE
+ if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
+diff -r 322370200e6a patch_info/innodb_io_tune.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_io_tune.info Mon Nov 03 05:08:52 2008 -0800
+@@ -0,0 +1,9 @@
++File=innodb_io_tune.patch
++Name=Tune InnoDB IO settings
++Version=1.0
++Author=Google
++License=GPL
++Comment=
++ChangeLog=
++2008-11-01
++VT: Initial porting
+diff -r 322370200e6a sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -147,7 +147,7 @@
+ innobase_additional_mem_pool_size, innobase_file_io_threads,
+ innobase_lock_wait_timeout, innobase_force_recovery,
+ innobase_open_files;
+-
++long innobase_read_io_threads, innobase_write_io_threads;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+ /* The default values for the following char* start-up parameters
+@@ -175,6 +175,23 @@
+ my_bool innobase_rollback_on_timeout = FALSE;
+ my_bool innobase_create_status_file = FALSE;
+ my_bool innobase_adaptive_hash_index = TRUE;
++
++/* Max number of IO requests merged to perform large IO in background
++ IO threads.
++*/
++long innobase_max_merged_io = 64;
++
++/* Minimum time interval, in seconds, allowed between calls to the innodb_show_status functions */
++long innobase_min_status_update_time_interval = 30;
++
++
++/* Default number of IO per second supported by server. Tunes background
++ IO rate
++*/
++long innobase_io_capacity = 100;
++
++/* Write dirty pages when pct dirty is less than max pct dirty */
++my_bool innobase_extra_dirty_writes = TRUE;
+
+ static char *internal_innobase_data_file_path = NULL;
+
+@@ -1372,7 +1389,11 @@
+
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
++ srv_io_capacity = (ulint) innobase_io_capacity;
++ srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes;
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
++ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
++ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+diff -r 322370200e6a sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800
+@@ -197,6 +197,7 @@
+
+ extern struct show_var_st innodb_status_variables[];
+ extern ulong innobase_fast_shutdown;
++extern long innobase_max_merged_io;
+ extern ulong innobase_large_page_size;
+ extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
+ extern longlong innobase_buffer_pool_size, innobase_log_file_size;
+@@ -205,10 +206,14 @@
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
+ extern long innobase_force_recovery;
++extern long innobase_read_io_threads, innobase_write_io_threads;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+ extern char *innobase_log_group_home_dir, *innobase_log_arch_dir;
+ extern char *innobase_unix_file_flush_method;
++extern long innobase_io_capacity;
++extern my_bool innobase_extra_dirty_writes;
++
+ /* The following variables have to be my_bool for SHOW VARIABLES to work */
+ extern my_bool innobase_log_archive,
+ innobase_use_doublewrite,
+diff -r 322370200e6a sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -4932,6 +4932,11 @@
+ OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE,
+ OPT_INNODB_MAX_PURGE_LAG,
+ OPT_INNODB_FILE_IO_THREADS,
++ OPT_INNODB_READ_IO_THREADS,
++ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_MAX_MERGED_IO,
++ OPT_INNODB_IO_CAPACITY,
++ OPT_INNODB_EXTRA_DIRTY_WRITES,
+ OPT_INNODB_LOCK_WAIT_TIMEOUT,
+ OPT_INNODB_THREAD_CONCURRENCY,
+ OPT_INNODB_COMMIT_CONCURRENCY,
+@@ -5302,6 +5307,25 @@
+ (gptr*) &global_system_variables.innodb_table_locks,
+ (gptr*) &global_system_variables.innodb_table_locks,
+ 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
++ {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO,
++ "Max number of IO requests merged to issue large IO from background IO threads.",
++ (gptr*) &innobase_max_merged_io,
++ (gptr*) &innobase_max_merged_io, 0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0},
++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
++ "Number of background read I/O threads in InnoDB.", (gptr*) &innobase_read_io_threads,
++ (gptr*) &innobase_read_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
++ "Number of background write I/O threads in InnoDB.", (gptr*) &innobase_write_io_threads,
++ (gptr*) &innobase_write_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
++ "Number of IO operations per second the server can do. Tunes background IO rate.",
++ (gptr*) &innobase_io_capacity,
++ (gptr*) &innobase_io_capacity, 0, GET_LONG,
++ REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0},
++ {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES,
++ "When set, flush dirty buffer pages when dirty pct is less than max dirty pct. ",
++ (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes,
++ 0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+ {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+ (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+diff -r 322370200e6a sql/set_var.cc
+--- a/sql/set_var.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -919,12 +919,14 @@
+ {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR},
+ {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL},
+ {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL},
++ {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL},
+ {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS},
+ {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
+ {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
+ {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
+ {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
+ {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
++ {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG },
+ {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
+ {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL},
+ {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR},
+@@ -943,6 +945,9 @@
+ {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
+ {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
+ {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
++ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
+ #endif
+ {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
+ {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
diff --git a/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch b/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch
new file mode 100644
index 0000000..b4a1a79
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/innodb_rw_lock_old.patch
@@ -0,0 +1,1357 @@
+diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c 2009-05-20 14:21:44.000000000 +0900
++++ b/innobase/btr/btr0sea.c 2009-05-20 14:39:34.000000000 +0900
+@@ -773,7 +773,7 @@
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+- ut_ad(btr_search_latch.writer != RW_LOCK_EX);
++ ut_ad(btr_search_latch.writer_count == 0);
+ ut_ad(btr_search_latch.reader_count > 0);
+
+ rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
+--- a/innobase/include/sync0rw.h 2009-01-30 06:42:20.000000000 +0900
++++ b/innobase/include/sync0rw.h 2009-04-16 16:15:28.000000000 +0900
+@@ -325,7 +325,17 @@
+ Accessor functions for rw lock. */
+ UNIV_INLINE
+ ulint
+-rw_lock_get_waiters(
++rw_lock_get_s_waiters(
++/*==================*/
++ rw_lock_t* lock);
++UNIV_INLINE
++ulint
++rw_lock_get_x_waiters(
++/*==================*/
++ rw_lock_t* lock);
++UNIV_INLINE
++ulint
++rw_lock_get_wx_waiters(
+ /*================*/
+ rw_lock_t* lock);
+ UNIV_INLINE
+@@ -408,6 +418,17 @@
+ rw_lock_debug_t* info); /* in: debug struct */
+ #endif /* UNIV_SYNC_DEBUG */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++/* This value means NOT_LOCKED */
++#define RW_LOCK_BIAS 0x00100000
++#else
++#error HAVE_ATOMIC_BUILTINS is not defined. Do you use enough new GCC or compatibles?
++#error Or do you use exact options for CFLAGS?
++#error e.g. (for x86_32): "-m32 -march=i586 -mtune=i686"
++#error e.g. (for Sparc_64): "-m64 -mcpu=v9"
++#error Otherwise, this build may be slower than normal version.
++#endif
++
+ /* NOTE! The structure appears here only for the compiler to know its size.
+ Do not use its fields directly! The structure used in the spin lock
+ implementation of a read-write lock. Several threads may have a shared lock
+@@ -417,9 +438,9 @@
+ field. Then no new readers are allowed in. */
+
+ struct rw_lock_struct {
+- os_event_t event; /* Used by sync0arr.c for thread queueing */
+-
+-#ifdef __WIN__
++ /* Used by sync0arr.c for thread queueing */
++ os_event_t s_event; /* Used for s_lock */
++ os_event_t x_event; /* Used for x_lock */
+ os_event_t wait_ex_event; /* This windows specific event is
+ used by the thread which has set the
+ lock state to RW_LOCK_WAIT_EX. The
+@@ -427,31 +448,35 @@
+ thread will be the next one to proceed
+ once the current the event gets
+ signalled. See LEMMA 2 in sync0sync.c */
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ volatile lint lock_word; /* Lock word, updated with atomic builtins */
+ #endif
+
+- ulint reader_count; /* Number of readers who have locked this
++ volatile ulint reader_count; /* Number of readers who have locked this
+ lock in the shared mode */
+- ulint writer; /* This field is set to RW_LOCK_EX if there
++ volatile ulint writer; /* This field is set to RW_LOCK_EX if there
+ is a writer owning the lock (in exclusive
+ mode), RW_LOCK_WAIT_EX if a writer is
+ queueing for the lock, and
+ RW_LOCK_NOT_LOCKED, otherwise. */
+- os_thread_id_t writer_thread;
++ volatile os_thread_id_t writer_thread;
+ /* Thread id of a possible writer thread */
+- ulint writer_count; /* Number of times the same thread has
++ volatile ulint writer_count; /* Number of times the same thread has
+ recursively locked the lock in the exclusive
+ mode */
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_t mutex; /* The mutex protecting rw_lock_struct */
++#endif
+ ulint pass; /* Default value 0. This is set to some
+ value != 0 given by the caller of an x-lock
+ operation, if the x-lock is to be passed to
+ another thread to unlock (which happens in
+ asynchronous i/o). */
+- ulint waiters; /* This ulint is set to 1 if there are
+- waiters (readers or writers) in the global
+- wait array, waiting for this rw_lock.
+- Otherwise, == 0. */
+- ibool writer_is_wait_ex;
++ volatile ulint s_waiters; /* 1: there are waiters (s_lock) */
++ volatile ulint x_waiters; /* 1: there are waiters (x_lock) */
++ volatile ulint wait_ex_waiters; /* 1: there are waiters (wait_ex) */
++ volatile ibool writer_is_wait_ex;
+ /* This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic
+--- a/innobase/include/sync0rw.ic 2009-01-30 06:42:20.000000000 +0900
++++ b/innobase/include/sync0rw.ic 2009-04-16 17:06:53.000000000 +0900
+@@ -47,20 +47,64 @@
+ Accessor functions for rw lock. */
+ UNIV_INLINE
+ ulint
+-rw_lock_get_waiters(
++rw_lock_get_s_waiters(
+ /*================*/
+ rw_lock_t* lock)
+ {
+- return(lock->waiters);
++ return(lock->s_waiters);
+ }
+ UNIV_INLINE
+-void
+-rw_lock_set_waiters(
++ulint
++rw_lock_get_x_waiters(
+ /*================*/
++ rw_lock_t* lock)
++{
++ return(lock->x_waiters);
++}
++UNIV_INLINE
++ulint
++rw_lock_get_wx_waiters(
++/*================*/
++ rw_lock_t* lock)
++{
++ return(lock->wait_ex_waiters);
++}
++UNIV_INLINE
++void
++rw_lock_set_s_waiters(
+ rw_lock_t* lock,
+ ulint flag)
+ {
+- lock->waiters = flag;
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->s_waiters, flag);
++#else
++ lock->s_waiters = flag;
++#endif
++}
++UNIV_INLINE
++void
++rw_lock_set_x_waiters(
++ rw_lock_t* lock,
++ ulint flag)
++{
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->x_waiters, flag);
++#else
++ lock->x_waiters = flag;
++#endif
++}
++UNIV_INLINE
++void
++rw_lock_set_wx_waiters(
++/*================*/
++ rw_lock_t* lock,
++ ulint flag)
++{
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->wait_ex_waiters, flag);
++#else
++ lock->wait_ex_waiters = flag;
++#endif
+ }
+ UNIV_INLINE
+ ulint
+@@ -68,7 +112,19 @@
+ /*===============*/
+ rw_lock_t* lock)
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (lock->writer == RW_LOCK_NOT_LOCKED) {
++ return(RW_LOCK_NOT_LOCKED);
++ }
++
++ if (lock->writer_is_wait_ex) {
++ return(RW_LOCK_WAIT_EX);
++ } else {
++ return(RW_LOCK_EX);
++ }
++#else
+ return(lock->writer);
++#endif
+ }
+ UNIV_INLINE
+ void
+@@ -96,6 +152,7 @@
+ {
+ lock->reader_count = count;
+ }
++#ifndef HAVE_ATOMIC_BUILTINS
+ UNIV_INLINE
+ mutex_t*
+ rw_lock_get_mutex(
+@@ -104,6 +161,7 @@
+ {
+ return(&(lock->mutex));
+ }
++#endif
+
+ /**********************************************************************
+ Returns the value of writer_count for the lock. Does not reserve the lock
+@@ -133,14 +191,26 @@
+ const char* file_name, /* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
+-#ifdef UNIV_SYNC_DEBUG
++#if defined(UNIV_SYNC_DEBUG) && !defined(HAVE_ATOMIC_BUILTINS)
+ ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+ #endif /* UNIV_SYNC_DEBUG */
+ /* Check if the writer field is free */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (UNIV_LIKELY(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)) {
++ /* try s-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) {
++ /* fail */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++ return(FALSE); /* locking did not succeed */
++ }
++ /* success */
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name,
+@@ -167,11 +237,15 @@
+ const char* file_name, /* in: file name where requested */
+ ulint line) /* in: line where lock requested */
+ {
+- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED);
++ ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+ ut_ad(rw_lock_get_reader_count(lock) == 0);
+
+ /* Set the shared lock by incrementing the reader count */
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ lock->reader_count++;
++#endif
+
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+@@ -199,7 +273,11 @@
+
+ rw_lock_set_writer(lock, RW_LOCK_EX);
+ lock->writer_thread = os_thread_get_curr_id();
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(lock->writer_count),1);
++#else
+ lock->writer_count++;
++#endif
+ lock->pass = 0;
+
+ lock->last_x_file_name = file_name;
+@@ -241,15 +319,21 @@
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+ #endif /* UNIV_SYNC_DEBUG */
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+@@ -272,11 +356,23 @@
+ {
+ ibool success = FALSE;
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
++ /* try s-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) {
++ /* fail */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++ return(FALSE); /* locking did not succeed */
++ }
++ /* success */
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (lock->writer == RW_LOCK_NOT_LOCKED) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name,
+@@ -289,7 +385,9 @@
+ success = TRUE;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return(success);
+ }
+@@ -309,6 +407,54 @@
+ {
+ ibool success = FALSE;
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (lock->reader_count == 0) {
++ /* try to lock writer */
++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX)
++ == RW_LOCK_NOT_LOCKED) {
++ /* success */
++retry_x_lock:
++ /* try x-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* success */
++ lock->writer_thread = curr_thread;
++ lock->pass = 0;
++ lock->writer_is_wait_ex = FALSE;
++ /* the next call may act as a memory barrier */
++ relock:
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ ut_ad(rw_lock_validate(lock));
++
++ return(TRUE);
++ } else {
++ /* fail (x-lock) */
++ if (__sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS)
++ == 0)
++ goto retry_x_lock;
++ }
++
++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED);
++ }
++ }
++
++ if (lock->pass == 0
++ && os_thread_eq(lock->writer_thread, curr_thread)) {
++ goto relock;
++ }
++
++ //ut_ad(rw_lock_validate(lock));
++
++ return(FALSE);
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) {
+@@ -339,6 +485,7 @@
+ ut_ad(rw_lock_validate(lock));
+
+ return(success);
++#endif
+ }
+
+ /**********************************************************************
+@@ -354,16 +501,33 @@
+ #endif
+ )
+ {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_t* mutex = &(lock->mutex);
+- ibool sg = FALSE;
++#endif
++ ibool x_sg = FALSE;
++ ibool wx_sg = FALSE;
++#ifdef HAVE_ATOMIC_BUILTINS
++ ibool last = FALSE;
++#endif
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(mutex);
++#endif
+
+ /* Reset the shared lock by decrementing the reader count */
+
+ ut_a(lock->reader_count > 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* unlock lock_word */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++
++ if(__sync_sub_and_fetch(&(lock->reader_count),1) == 0) {
++ last = TRUE;
++ }
++#else
+ lock->reader_count--;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+@@ -372,22 +536,39 @@
+ /* If there may be waiters and this was the last s-lock,
+ signal the object */
+
+- if (UNIV_UNLIKELY(lock->waiters)
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0))) {
++ os_event_set(lock->wait_ex_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++ else if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->x_waiters, 0))) {
++ os_event_set(lock->x_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++#else
++ if (UNIV_UNLIKELY(lock->wait_ex_waiters)
+ && lock->reader_count == 0) {
+- sg = TRUE;
++ wx_sg = TRUE;
+
+- rw_lock_set_waiters(lock, 0);
++ rw_lock_set_wx_waiters(lock, 0);
++ }
++ else if (UNIV_UNLIKELY(lock->x_waiters)
++ && lock->reader_count == 0) {
++ x_sg = TRUE;
++
++ rw_lock_set_x_waiters(lock, 0);
+ }
+
+ mutex_exit(mutex);
+
+- if (UNIV_UNLIKELY(sg)) {
+-#ifdef __WIN__
++ if (UNIV_UNLIKELY(wx_sg)) {
+ os_event_set(lock->wait_ex_event);
+-#endif
+- os_event_set(lock->event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ } else if (UNIV_UNLIKELY(x_sg)) {
++ os_event_set(lock->x_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
++#endif
+
+ ut_ad(rw_lock_validate(lock));
+
+@@ -409,13 +590,22 @@
+
+ ut_ad(lock->reader_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_sub_and_fetch(&(lock->reader_count),1);
++#else
+ lock->reader_count--;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+ #endif
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ ut_ad(!lock->s_waiters);
++ ut_ad(!lock->x_waiters);
++#else
+ ut_ad(!lock->waiters);
++#endif
+ ut_ad(rw_lock_validate(lock));
+ #ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+@@ -435,41 +625,83 @@
+ #endif
+ )
+ {
+- ibool sg = FALSE;
++#ifdef HAVE_ATOMIC_BUILTINS
++ ibool last = FALSE;
++#endif
++ ibool s_sg = FALSE;
++ ibool x_sg = FALSE;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(&(lock->mutex));
++#endif
+
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad(lock->writer_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) {
++ last = TRUE;
++ }
++
++ if (last) {
++ /* unlock lock_word */
++ __sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS);
++
++ /* FIXME: -1 is not a well-mannered value for a pthread thread id,
++ but we should not keep the id of a thread that is no longer the owner. */
++ lock->writer_thread = -1;
++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED);
++ }
++#else
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+ #endif
+
+ /* If there may be waiters, signal the lock */
+- if (UNIV_UNLIKELY(lock->waiters)
+- && lock->writer_count == 0) {
+-
+- sg = TRUE;
+- rw_lock_set_waiters(lock, 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (last) {
++ if(__sync_lock_test_and_set(&lock->s_waiters, 0)){
++ s_sg = TRUE;
++ }
++ if(__sync_lock_test_and_set(&lock->x_waiters, 0)){
++ x_sg = TRUE;
++ }
++ }
++#else
++ if (lock->writer_count == 0) {
++ if(lock->s_waiters){
++ s_sg = TRUE;
++ rw_lock_set_s_waiters(lock, 0);
++ }
++ if(lock->x_waiters){
++ x_sg = TRUE;
++ rw_lock_set_x_waiters(lock, 0);
++ }
+ }
+
+ mutex_exit(&(lock->mutex));
++#endif
+
+- if (UNIV_UNLIKELY(sg)) {
++ if (UNIV_UNLIKELY(s_sg)) {
++ os_event_set(lock->s_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++ if (UNIV_UNLIKELY(x_sg)) {
+ #ifdef __WIN__
++ /* I doubt the necessity of it. */
+ os_event_set(lock->wait_ex_event);
+ #endif
+- os_event_set(lock->event);
++ os_event_set(lock->x_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
+
+@@ -494,9 +726,13 @@
+
+ ut_ad(lock->writer_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) {
++#else
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
++#endif
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
+
+@@ -504,7 +740,12 @@
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+ #endif
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ ut_ad(!lock->s_waiters);
++ ut_ad(!lock->x_waiters);
++#else
+ ut_ad(!lock->waiters);
++#endif
+ ut_ad(rw_lock_validate(lock));
+
+ #ifdef UNIV_SYNC_PERF_STAT
+diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c
+--- a/innobase/sync/sync0arr.c 2009-01-30 06:42:24.000000000 +0900
++++ b/innobase/sync/sync0arr.c 2009-04-16 16:15:28.000000000 +0900
+@@ -309,13 +309,13 @@
+ {
+ if (type == SYNC_MUTEX) {
+ return(os_event_reset(((mutex_t *) object)->event));
+-#ifdef __WIN__
+ } else if (type == RW_LOCK_WAIT_EX) {
+ return(os_event_reset(
+ ((rw_lock_t *) object)->wait_ex_event));
+-#endif
+- } else {
+- return(os_event_reset(((rw_lock_t *) object)->event));
++ } else if (type == RW_LOCK_SHARED) {
++ return(os_event_reset(((rw_lock_t *) object)->s_event));
++ } else { /* RW_LOCK_EX */
++ return(os_event_reset(((rw_lock_t *) object)->x_event));
+ }
+ }
+
+@@ -415,15 +415,12 @@
+
+ if (cell->request_type == SYNC_MUTEX) {
+ event = ((mutex_t*) cell->wait_object)->event;
+-#ifdef __WIN__
+- /* On windows if the thread about to wait is the one which
+- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then
+- it waits on a special event i.e.: wait_ex_event. */
+ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+ event = ((rw_lock_t*) cell->wait_object)->wait_ex_event;
+-#endif
+- } else {
+- event = ((rw_lock_t*) cell->wait_object)->event;
++ } else if (cell->request_type == RW_LOCK_SHARED) {
++ event = ((rw_lock_t*) cell->wait_object)->s_event;
++ } else {
++ event = ((rw_lock_t*) cell->wait_object)->x_event;
+ }
+
+ cell->waiting = TRUE;
+@@ -464,6 +461,7 @@
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
++ ulint writer;
+
+ type = cell->request_type;
+
+@@ -492,12 +490,10 @@
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+-#ifdef __WIN__
+ || type == RW_LOCK_WAIT_EX
+-#endif
+ || type == RW_LOCK_SHARED) {
+
+- fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
++ fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file);
+
+ rwlock = cell->old_wait_rw_lock;
+
+@@ -505,21 +501,23 @@
+ " RW-latch at %p created in file %s line %lu\n",
+ rwlock, rwlock->cfile_name,
+ (ulong) rwlock->cline);
+- if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
++ writer = rw_lock_get_writer(rwlock);
++ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+- rwlock->writer == RW_LOCK_EX
++ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+- "number of readers %lu, waiters flag %lu\n"
++ "number of readers %lu, s_waiters flag %lu, x_waiters flag %lu\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rwlock->reader_count,
+- (ulong) rwlock->waiters,
++ (ulong) rwlock->s_waiters,
++ (ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters),
+ rwlock->last_s_file_name,
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+@@ -839,11 +837,15 @@
+ /*========================*/
+ sync_array_t* arr) /* in: wait array */
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(arr->sg_count),1);
++#else
+ sync_array_enter(arr);
+
+ arr->sg_count++;
+
+ sync_array_exit(arr);
++#endif
+ }
+
+ /**************************************************************************
+@@ -880,19 +882,23 @@
+
+ mutex = cell->wait_object;
+ os_event_set(mutex->event);
+-#ifdef __WIN__
+ } else if (cell->request_type
+ == RW_LOCK_WAIT_EX) {
+ rw_lock_t* lock;
+
+ lock = cell->wait_object;
+ os_event_set(lock->wait_ex_event);
+-#endif
+- } else {
++ } else if (cell->request_type
++ == RW_LOCK_SHARED) {
+ rw_lock_t* lock;
+
+ lock = cell->wait_object;
+- os_event_set(lock->event);
++ os_event_set(lock->s_event);
++ } else {
++ rw_lock_t* lock;
++
++ lock = cell->wait_object;
++ os_event_set(lock->x_event);
+ }
+ }
+ }
+diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
+--- a/innobase/sync/sync0rw.c 2009-01-30 06:42:24.000000000 +0900
++++ b/innobase/sync/sync0rw.c 2009-04-16 17:33:59.000000000 +0900
+@@ -99,6 +99,7 @@
+ object is created, then the following call initializes
+ the sync system. */
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_create(rw_lock_get_mutex(lock));
+ mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+
+@@ -108,8 +109,14 @@
+ lock->mutex.cmutex_name = cmutex_name;
+ lock->mutex.mutex_type = 1;
+ #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
++#endif /* !HAVE_ATOMIC_BUILTINS */
+
+- rw_lock_set_waiters(lock, 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ lock->lock_word = RW_LOCK_BIAS;
++#endif
++ rw_lock_set_s_waiters(lock, 0);
++ rw_lock_set_x_waiters(lock, 0);
++ rw_lock_set_wx_waiters(lock, 0);
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ lock->writer_count = 0;
+ rw_lock_set_reader_count(lock, 0);
+@@ -130,11 +137,9 @@
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_s_line = 0;
+ lock->last_x_line = 0;
+- lock->event = os_event_create(NULL);
+-
+-#ifdef __WIN__
++ lock->s_event = os_event_create(NULL);
++ lock->x_event = os_event_create(NULL);
+ lock->wait_ex_event = os_event_create(NULL);
+-#endif
+
+ mutex_enter(&rw_lock_list_mutex);
+
+@@ -162,19 +167,21 @@
+ ut_a(rw_lock_validate(lock));
+ #endif /* UNIV_DEBUG */
+ ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+- ut_a(rw_lock_get_waiters(lock) == 0);
++ ut_a(rw_lock_get_s_waiters(lock) == 0);
++ ut_a(rw_lock_get_x_waiters(lock) == 0);
++ ut_a(rw_lock_get_wx_waiters(lock) == 0);
+ ut_a(rw_lock_get_reader_count(lock) == 0);
+
+ lock->magic_n = 0;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_free(rw_lock_get_mutex(lock));
++#endif
+
+ mutex_enter(&rw_lock_list_mutex);
+- os_event_free(lock->event);
+-
+-#ifdef __WIN__
++ os_event_free(lock->s_event);
++ os_event_free(lock->x_event);
+ os_event_free(lock->wait_ex_event);
+-#endif
+
+ if (UT_LIST_GET_PREV(list, lock)) {
+ ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+@@ -192,26 +199,43 @@
+ Checks that the rw-lock has been initialized and that there are no
+ simultaneous shared and exclusive locks. */
+
++/* MEMO: If HAVE_ATOMIC_BUILTINS, we should use this function statically. */
++
+ ibool
+ rw_lock_validate(
+ /*=============*/
+ rw_lock_t* lock)
+ {
++ ulint test;
+ ut_a(lock);
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ ut_a(lock->magic_n == RW_LOCK_MAGIC_N);
++#ifndef HAVE_ATOMIC_BUILTINS
+ ut_a((rw_lock_get_reader_count(lock) == 0)
+ || (rw_lock_get_writer(lock) != RW_LOCK_EX));
+- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED));
+- ut_a((rw_lock_get_waiters(lock) == 0)
+- || (rw_lock_get_waiters(lock) == 1));
++#endif
++ test = rw_lock_get_writer(lock);
++ ut_a((test == RW_LOCK_EX)
++ || (test == RW_LOCK_WAIT_EX)
++ || (test == RW_LOCK_NOT_LOCKED));
++ test = rw_lock_get_s_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++ test = rw_lock_get_x_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++ test = rw_lock_get_wx_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++#ifndef HAVE_ATOMIC_BUILTINS
+ ut_a((lock->writer != RW_LOCK_EX) || (lock->writer_count > 0));
+
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return(TRUE);
+ }
+@@ -237,13 +261,14 @@
+ ut_ad(rw_lock_validate(lock));
+
+ lock_loop:
++ i = 0;
++spin_loop:
+ rw_s_spin_wait_count++;
+
+ /* Spin waiting for the writer field to become free */
+- i = 0;
+
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
++ while (i < SYNC_SPIN_ROUNDS
++ && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+@@ -262,15 +287,27 @@
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ /* We try once again to obtain the lock */
+
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Success */
+ } else {
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* same approach as in sync0sync.c */
++ i++;
++
++ if (i < SYNC_SPIN_ROUNDS) {
++ goto spin_loop;
++ }
++#endif
+ /* If we get here, locking did not succeed, we may
+ suspend the thread to wait in the wait array */
+
+@@ -281,9 +318,26 @@
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
++ rw_lock_set_s_waiters(lock, 1);
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* same approach as in sync0sync.c */
++ for (i = 0; i < 4; i++) {
++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Success */
++ }
++ }
+
++ /* If a wait_ex waiter is stalled, wake it up. */
++ if (lock->reader_count == 0
++ && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0)) {
++ os_event_set(lock->wait_ex_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++#else
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -318,13 +372,19 @@
+ {
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ lock->writer_thread = os_thread_get_curr_id();
+
+ lock->pass = 0;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#else
++ __sync_synchronize();
++#endif
+ }
+
+ /**********************************************************************
+@@ -342,6 +402,89 @@
+ const char* file_name,/* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ os_thread_id_t curr_thread = os_thread_get_curr_id();
++retry_writer:
++ /* try to lock writer */
++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX)
++ == RW_LOCK_NOT_LOCKED) {
++ /* success */
++ /* obtain RW_LOCK_WAIT_EX right */
++ lock->writer_thread = curr_thread;
++ lock->pass = pass;
++ lock->writer_is_wait_ex = TRUE;
++ /* an atomic operation might be safer with regard to memory ordering. */
++ __sync_synchronize();
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
++ file_name, line);
++#endif
++ }
++
++ if (!os_thread_eq(lock->writer_thread, curr_thread)) {
++ return(RW_LOCK_NOT_LOCKED);
++ }
++
++ switch(rw_lock_get_writer(lock)) {
++ case RW_LOCK_WAIT_EX:
++ /* have right to try x-lock */
++retry_x_lock:
++ /* try x-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* success */
++ lock->pass = pass;
++ lock->writer_is_wait_ex = FALSE;
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX);
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
++ file_name, line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ /* Locking succeeded, we may return */
++ return(RW_LOCK_EX);
++ } else if(__sync_fetch_and_add(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* retry x-lock */
++ goto retry_x_lock;
++ }
++
++ /* There are readers, we have to wait */
++ return(RW_LOCK_WAIT_EX);
++
++ break;
++
++ case RW_LOCK_EX:
++ /* already have x-lock */
++ if ((lock->pass == 0)&&(pass == 0)) {
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name,
++ line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ /* Locking succeeded, we may return */
++ return(RW_LOCK_EX);
++ }
++
++ return(RW_LOCK_NOT_LOCKED);
++
++ break;
++
++ default: /* RW_LOCK_NOT_LOCKED? maybe impossible */
++ goto retry_writer;
++ }
++#else /* HAVE_ATOMIC_BUILTINS */
++
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+ #endif /* UNIV_SYNC_DEBUG */
+@@ -423,6 +566,7 @@
+ /* Locking succeeded, we may return */
+ return(RW_LOCK_EX);
+ }
++#endif /* HAVE_ATOMIC_BUILTINS */
+
+ /* Locking did not succeed */
+ return(RW_LOCK_NOT_LOCKED);
+@@ -448,19 +592,33 @@
+ ulint line) /* in: line where requested */
+ {
+ ulint index; /* index of the reserved wait cell */
+- ulint state; /* lock state acquired */
++ ulint state = RW_LOCK_NOT_LOCKED; /* lock state acquired */
++#ifdef HAVE_ATOMIC_BUILTINS
++ ulint prev_state = RW_LOCK_NOT_LOCKED;
++#endif
+ ulint i; /* spin round count */
+
+ ut_ad(rw_lock_validate(lock));
+
+ lock_loop:
++ i = 0;
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ prev_state = state;
++#else
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter_fast(&(lock->mutex));
++#endif
+
+ state = rw_lock_x_lock_low(lock, pass, file_name, line);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (state != prev_state) i=0; /* if progress, reset counter. */
++#else
+ mutex_exit(&(lock->mutex));
++#endif
+
++spin_loop:
+ if (state == RW_LOCK_EX) {
+
+ return; /* Locking succeeded */
+@@ -468,10 +626,9 @@
+ } else if (state == RW_LOCK_NOT_LOCKED) {
+
+ /* Spin waiting for the writer field to become free */
+- i = 0;
+
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
++ while (i < SYNC_SPIN_ROUNDS
++ && lock->lock_word != RW_LOCK_BIAS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+ srv_spin_wait_delay));
+@@ -485,9 +642,12 @@
+ } else if (state == RW_LOCK_WAIT_EX) {
+
+ /* Spin waiting for the reader count field to become zero */
+- i = 0;
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ while (lock->lock_word != RW_LOCK_BIAS
++#else
+ while (rw_lock_get_reader_count(lock) != 0
++#endif
+ && i < SYNC_SPIN_ROUNDS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+@@ -500,7 +660,6 @@
+ os_thread_yield();
+ }
+ } else {
+- i = 0; /* Eliminate a compiler warning */
+ ut_error;
+ }
+
+@@ -516,34 +675,69 @@
+ /* We try once again to obtain the lock. Acquire the mutex protecting
+ the rw-lock fields */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ prev_state = state;
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ state = rw_lock_x_lock_low(lock, pass, file_name, line);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (state != prev_state) i=0; /* if progress, reset counter. */
++#endif
++
+ if (state == RW_LOCK_EX) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Locking succeeded */
+ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* same approach as in sync0sync.c */
++ i++;
++
++ if (i < SYNC_SPIN_ROUNDS) {
++ goto spin_loop;
++ }
++#endif
++
+ rw_x_system_call_count++;
+
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock,
+-#ifdef __WIN__
+- /* On windows RW_LOCK_WAIT_EX signifies
+- that this thread should wait on the
+- special wait_ex_event. */
+ (state == RW_LOCK_WAIT_EX)
+ ? RW_LOCK_WAIT_EX :
+-#endif
+ RW_LOCK_EX,
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
++ if (state == RW_LOCK_WAIT_EX) {
++ rw_lock_set_wx_waiters(lock, 1);
++ } else {
++ rw_lock_set_x_waiters(lock, 1);
++ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* same approach as in sync0sync.c */
++ for (i = 0; i < 4; i++) {
++ prev_state = state;
++ state = rw_lock_x_lock_low(lock, pass, file_name, line);
++ if (state == RW_LOCK_EX) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Locking succeeded */
++ }
++ if (state != prev_state) {
++ /* retry! */
++ sync_array_free_cell(sync_primary_wait_array, index);
++ goto lock_loop;
++ }
++ }
++#else
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -718,7 +912,9 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+@@ -728,7 +924,9 @@
+ && (info->pass == 0)
+ && (info->lock_type == lock_type)) {
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+ /* Found! */
+
+ return(TRUE);
+@@ -736,7 +934,9 @@
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+
+ return(FALSE);
+ }
+@@ -758,21 +958,25 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ if (lock_type == RW_LOCK_SHARED) {
+ if (lock->reader_count > 0) {
+ ret = TRUE;
+ }
+ } else if (lock_type == RW_LOCK_EX) {
+- if (lock->writer == RW_LOCK_EX) {
++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) {
+ ret = TRUE;
+ }
+ } else {
+ ut_error;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+
+ return(ret);
+ }
+@@ -801,16 +1005,26 @@
+
+ count++;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++ || (rw_lock_get_s_waiters(lock) != 0)
++ || (rw_lock_get_x_waiters(lock) != 0)
++ || (rw_lock_get_wx_waiters(lock) != 0)) {
+
+ fprintf(stderr, "RW-LOCK: %p ", lock);
+
+- if (rw_lock_get_waiters(lock)) {
+- fputs(" Waiters for the lock exist\n", stderr);
++ if (rw_lock_get_s_waiters(lock)) {
++ fputs(" s_waiters for the lock exist,", stderr);
++ }
++ if (rw_lock_get_x_waiters(lock)) {
++ fputs(" x_waiters for the lock exist\n", stderr);
++ }
++ if (rw_lock_get_wx_waiters(lock)) {
++ fputs(" wait_ex_waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+@@ -822,7 +1036,9 @@
+ }
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+@@ -847,10 +1063,18 @@
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++ || (rw_lock_get_s_waiters(lock) != 0)
++ || (rw_lock_get_x_waiters(lock) != 0)
++ || (rw_lock_get_wx_waiters(lock) != 0)) {
+
+- if (rw_lock_get_waiters(lock)) {
+- fputs(" Waiters for the lock exist\n", stderr);
++ if (rw_lock_get_s_waiters(lock)) {
++ fputs(" s_waiters for the lock exist,", stderr);
++ }
++ if (rw_lock_get_x_waiters(lock)) {
++ fputs(" x_waiters for the lock exist\n", stderr);
++ }
++ if (rw_lock_get_wx_waiters(lock)) {
++ fputs(" wait_ex_waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+@@ -909,14 +1133,18 @@
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)) {
+ count++;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_rw_lock.info 2009-04-16 16:15:28.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_rw_lock.patch
++Name=Fix of InnoDB rw_locks
++Version=1.0
++Author=Yasufumi Kinoshita
++License=BSD
++Comment=
diff --git a/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch b/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch
new file mode 100644
index 0000000..bf8f6b4
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/innodb_show_hashed_memory_standalone.patch
@@ -0,0 +1,264 @@
+diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c
+--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900
+@@ -2454,13 +2454,15 @@
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
+ fprintf(file,
+- "Buffer pool size %lu\n"
+- "Free buffers %lu\n"
+- "Database pages %lu\n"
+- "Modified db pages %lu\n"
++ "Buffer pool size %lu\n"
++ "Buffer pool size, bytes %lu\n"
++ "Free buffers %lu\n"
++ "Database pages %lu\n"
++ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ (ulong) size,
++ (ulong) size * UNIV_PAGE_SIZE,
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c
+--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900
+@@ -4472,3 +4472,30 @@
+
+ return(mach_read_from_2(page + FIL_PAGE_TYPE));
+ }
++
++/*************************************************************************
++Returns the total number of cells in the two fil_system hash tables. */
++
++ulint
++fil_system_hash_cells(void)
++/*=======================*/
++{
++	/* Report 0 while fil_system is still NULL */
++	if (!fil_system) {
++		return 0;
++	}
++	return (fil_system->spaces->n_cells
++		+ fil_system->name_hash->n_cells);
++}
++
++ulint
++fil_system_hash_nodes(void)
++/*=======================*/
++{
++	/* Bytes: space_list length * (fil_space_t + block header); 0 if NULL */
++	if (!fil_system) {
++		return 0;
++	}
++	return (UT_LIST_GET_LEN(fil_system->space_list)
++		* (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h
+--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900
+@@ -701,6 +701,16 @@
+ written to page, the return value not defined */
+ byte* page); /* in: file page */
+
++/*************************************************************************
++Return file system hash table information. */
++
++ulint
++fil_system_hash_cells(void);
++/*========================*/
++
++ulint
++fil_system_hash_nodes(void);
++/*========================*/
+
+ typedef struct fil_space_struct fil_space_t;
+
+diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h
+--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900
+@@ -77,6 +77,17 @@
+ /*=============================*/
+ /* out: pointer to the in_ibuf field */
+
++/*************************************************************************
++Return thread-local hash table information. */
++
++ulint
++thr_local_hash_cells(void);
++/*=======================*/
++
++ulint
++thr_local_hash_nodes(void);
++/*=======================*/
++
+ #ifndef UNIV_NONINL
+ #include "thr0loc.ic"
+ #endif
+diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c
+--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900
+@@ -1645,6 +1645,14 @@
+ time_t current_time;
+ ulint n_reserved;
+
++ ulint btr_search_sys_subtotal;
++ ulint lock_sys_subtotal;
++ ulint recv_sys_subtotal;
++ ulint io_counter_subtotal;
++
++ ulint i;
++ trx_t* trx;
++
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+@@ -1747,6 +1755,80 @@
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+
++	/* Calculate reserved memory; subtotal stays 0 if btr_search_sys is NULL */
++	btr_search_sys_subtotal = 0;
++	if (btr_search_sys && btr_search_sys->hash_index->heap) {
++		btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
++	} else if (btr_search_sys) {
++		for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) {
++			btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
++		}
++	}
++
++ lock_sys_subtotal = 0;
++ if (trx_sys) {
++ mutex_enter(&kernel_mutex);
++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
++ while (trx) {
++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
++ }
++ mutex_exit(&kernel_mutex);
++ }
++
++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
++ ? mem_heap_get_size(recv_sys->heap) : 0);
++
++ fprintf(file,
++ "Internal hash tables (constant factor + variable factor)\n"
++ " Adaptive hash index %lu \t(%lu + %lu)\n"
++ " Page hash %lu\n"
++ " Dictionary cache %lu \t(%lu + %lu)\n"
++ " File system %lu \t(%lu + %lu)\n"
++ " Lock system %lu \t(%lu + %lu)\n"
++ " Recovery system %lu \t(%lu + %lu)\n"
++ " Threads %lu \t(%lu + %lu)\n",
++
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
++ + btr_search_sys_subtotal,
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) btr_search_sys_subtotal,
++
++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
++
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)
++ + dict_sys->size) : 0),
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0),
++ (ulong) (dict_sys ? (dict_sys->size) : 0),
++
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
++ + fil_system_hash_nodes()),
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) fil_system_hash_nodes(),
++
++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + lock_sys_subtotal),
++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) lock_sys_subtotal,
++
++ (ulong) (((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + recv_sys_subtotal),
++ (ulong) ((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) recv_sys_subtotal,
++
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)
++ + thr_local_hash_nodes()),
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) thr_local_hash_nodes());
++
+ if (srv_use_awe) {
+ fprintf(file,
+ "In addition to that %lu MB of AWE memory allocated\n",
+diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c
+--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900
+@@ -32,6 +32,7 @@
+
+ /* The hash table. The module is not yet initialized when it is NULL. */
+ hash_table_t* thr_local_hash = NULL;
++ulint thr_local_hash_n_nodes = 0;
+
+ /* The private data for each thread should be put to
+ the structure below and the accessor functions written
+@@ -223,6 +224,7 @@
+ HASH_INSERT(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(os_thread_get_curr_id()),
+ local);
++ thr_local_hash_n_nodes++;
+
+ mutex_exit(&thr_local_mutex);
+ }
+@@ -251,6 +253,7 @@
+
+ HASH_DELETE(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(id), local);
++ thr_local_hash_n_nodes--;
+
+ mutex_exit(&thr_local_mutex);
+
+@@ -274,3 +277,29 @@
+ mutex_create(&thr_local_mutex);
+ mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL);
+ }
++
++/*************************************************************************
++Returns the number of cells in the thread-local hash table. */
++
++ulint
++thr_local_hash_cells(void)
++/*======================*/
++{
++	/* thr_local_hash is NULL until the module is initialized */
++	if (!thr_local_hash) {
++		return 0;
++	}
++	return (thr_local_hash->n_cells);
++}
++
++ulint
++thr_local_hash_nodes(void)
++/*======================*/
++{
++	/* Bytes: node count * (thr_local_t + block header); 0 if hash is NULL */
++	if (!thr_local_hash) {
++		return 0;
++	}
++	return (thr_local_hash_n_nodes
++		* (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_show_hashed_memory.patch
++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
diff --git a/percona/5.0.87-b20-20091116/mirror_binlog.patch b/percona/5.0.87-b20-20091116/mirror_binlog.patch
new file mode 100644
index 0000000..d52e806
--- /dev/null
+++ b/percona/5.0.87-b20-20091116/mirror_binlog.patch
@@ -0,0 +1,2694 @@
+diff -r 66cc9e0a6768 mysql-test/lib/mtr_cases.pl
+--- a/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:37:12 2008 -0800
++++ b/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:46:15 2008 -0800
+@@ -334,6 +334,10 @@
+
+ $tinfo->{'slave_num'}= 1; # Default for rpl* tests, use one slave
+
++ if ( $tname eq 'rpl_mirror_binlog' )
++ {
++ $tinfo->{'slave_num'}= 3;
++ }
+ }
+
+ if ( defined mtr_match_prefix($tname,"federated") )
+@@ -344,15 +348,20 @@
+
+ my $master_opt_file= "$testdir/$tname-master.opt";
+ my $slave_opt_file= "$testdir/$tname-slave.opt";
+- my $slave_mi_file= "$testdir/$tname.slave-mi";
++ my $slave_mi_files= ["$testdir/$tname.slave-mi",
++ "$testdir/$tname.1.slave-mi",
++ "$testdir/$tname.2.slave-mi"];
+ my $master_sh= "$testdir/$tname-master.sh";
+ my $slave_sh= "$testdir/$tname-slave.sh";
+ my $disabled_file= "$testdir/$tname.disabled";
+ my $im_opt_file= "$testdir/$tname-im.opt";
+
+- $tinfo->{'master_opt'}= [];
+- $tinfo->{'slave_opt'}= [];
+- $tinfo->{'slave_mi'}= [];
++ $tinfo->{'master_opt'}= [];
++ $tinfo->{'slave_opt'}= [];
++ $tinfo->{'slave_mi'}= {};
++ $tinfo->{'slave_mi'}{0}= [];
++ $tinfo->{'slave_mi'}{1}= [];
++ $tinfo->{'slave_mi'}{2}= [];
+
+ if ( -f $master_opt_file )
+ {
+@@ -427,9 +436,14 @@
+ push(@{$tinfo->{'slave_opt'}}, @$slave_opt);
+ }
+
+- if ( -f $slave_mi_file )
++ my $mi_idx= 0;
++ foreach my $slave_mi_file ( @$slave_mi_files )
+ {
+- $tinfo->{'slave_mi'}= mtr_get_opts_from_file($slave_mi_file);
++ if ( -f $slave_mi_file )
++ {
++ $tinfo->{'slave_mi'}{$mi_idx}= mtr_get_opts_from_file($slave_mi_file);
++ }
++ $mi_idx+= 1;
+ }
+
+ if ( -f $master_sh )
+diff -r 66cc9e0a6768 mysql-test/mysql-test-run.pl
+--- a/mysql-test/mysql-test-run.pl Thu Dec 04 21:37:12 2008 -0800
++++ b/mysql-test/mysql-test-run.pl Thu Dec 04 21:46:15 2008 -0800
+@@ -275,6 +275,7 @@
+ our $opt_stress_test_file= "";
+
+ our $opt_warnings;
++our $opt_slave_innodb= 0;
+
+ our $opt_skip_ndbcluster= 0;
+ our $opt_skip_ndbcluster_slave= 0;
+@@ -299,6 +300,8 @@
+ our $used_binlog_format;
+ our $used_default_engine;
+ our $debug_compiled_binaries;
++
++our $current_testname= "";
+
+ our %mysqld_variables;
+
+@@ -645,6 +648,7 @@
+ 'testcase-timeout=i' => \$opt_testcase_timeout,
+ 'suite-timeout=i' => \$opt_suite_timeout,
+ 'warnings|log-warnings' => \$opt_warnings,
++ 'slave-innodb' => \$opt_slave_innodb,
+
+ # Options which are no longer used
+ (map { $_ => \&warn_about_removed_option } @removed_options),
+@@ -1001,6 +1005,14 @@
+ {
+ $ENV{'BIG_TEST'}= 1;
+ }
++
++ # --------------------------------------------------------------------------
++ # Big test flags
++ # --------------------------------------------------------------------------
++ if ( $opt_big_test )
++ {
++ $ENV{'BIG_TEST'}= 1;
++ }
+
+ # --------------------------------------------------------------------------
+ # Gcov flag
+@@ -1885,7 +1897,9 @@
+ $ENV{'SLAVE_MYSOCK'}= $slave->[0]->{'path_sock'};
+ $ENV{'SLAVE_MYPORT'}= $slave->[0]->{'port'};
+ $ENV{'SLAVE_MYPORT1'}= $slave->[1]->{'port'};
++ $ENV{'SLAVE_MYSOCK1'}= $slave->[1]->{'path_sock'};
+ $ENV{'SLAVE_MYPORT2'}= $slave->[2]->{'port'};
++ $ENV{'SLAVE_MYSOCK2'}= $slave->[2]->{'path_sock'};
+ $ENV{'MYSQL_TCP_PORT'}= $mysqld_variables{'port'};
+ $ENV{'DEFAULT_MASTER_PORT'}= $mysqld_variables{'master-port'};
+
+@@ -2375,6 +2389,8 @@
+ if ( ! $glob_win32 )
+ {
+ symlink("$glob_mysql_test_dir/std_data", "$opt_vardir/std_data_ln");
++    my @a = ("chmod", "-R", "o+r", "$glob_mysql_test_dir/std_data");  # give "others" read access to std_data
++    system(@a) == 0 or die "system @a failed: $?";  # include the failing command in the message
+ }
+ else
+ {
+@@ -3466,6 +3482,8 @@
+ $ENV{'TZ'}= $tinfo->{'timezone'};
+ mtr_verbose("Setting timezone: $tinfo->{'timezone'}");
+
++ $current_testname= $tinfo->{'name'};
++
+ my $master_restart= run_testcase_need_master_restart($tinfo);
+ my $slave_restart= run_testcase_need_slave_restart($tinfo);
+
+@@ -3881,7 +3899,8 @@
+ unless $mysqld->{'type'} eq 'slave';
+
+ mtr_add_arg($args, "%s--init-rpl-role=slave", $prefix);
+- if (! ( $opt_skip_slave_binlog || $skip_binlog ))
++
++ if (! ($opt_skip_slave_binlog or ($current_testname eq 'rpl_mirror_binlog')) )
+ {
+ mtr_add_arg($args, "%s--log-bin=%s/log/slave%s-bin", $prefix,
+ $opt_vardir, $sidx); # FIXME use own dir for binlogs
+@@ -4568,7 +4587,7 @@
+ if ( ! $slave->[$idx]->{'pid'} )
+ {
+ mysqld_start($slave->[$idx],$tinfo->{'slave_opt'},
+- $tinfo->{'slave_mi'});
++ $tinfo->{'slave_mi'}{$idx});
+
+ }
+ }
+@@ -4580,7 +4599,6 @@
+ # Wait for clusters to start
+ foreach my $cluster (@{$clusters})
+ {
+-
+ next if !$cluster->{'pid'};
+
+ if (ndbcluster_wait_started($cluster, ""))
+@@ -5179,6 +5197,7 @@
+ skip-im Don't start IM, and skip the IM test cases
+ big-test Set the environment variable BIG_TEST, which can be
+ checked from test cases.
++
+
+ Options that specify ports
+
+diff -r 66cc9e0a6768 mysql-test/r/rpl_mirror_binlog.result
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/r/rpl_mirror_binlog.result Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,441 @@
++stop slave;
++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
++reset master;
++reset slave;
++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
++start slave;
++drop table if exists t1;
++create table t1(n int) engine = InnoDB;
++insert into t1 values (300);
++insert into t1 values (299);
++insert into t1 values (298);
++insert into t1 values (297);
++insert into t1 values (296);
++insert into t1 values (295);
++insert into t1 values (294);
++insert into t1 values (293);
++insert into t1 values (292);
++insert into t1 values (291);
++insert into t1 values (290);
++insert into t1 values (289);
++insert into t1 values (288);
++insert into t1 values (287);
++insert into t1 values (286);
++insert into t1 values (285);
++insert into t1 values (284);
++insert into t1 values (283);
++insert into t1 values (282);
++insert into t1 values (281);
++insert into t1 values (280);
++insert into t1 values (279);
++insert into t1 values (278);
++insert into t1 values (277);
++insert into t1 values (276);
++insert into t1 values (275);
++insert into t1 values (274);
++insert into t1 values (273);
++insert into t1 values (272);
++insert into t1 values (271);
++insert into t1 values (270);
++insert into t1 values (269);
++insert into t1 values (268);
++insert into t1 values (267);
++insert into t1 values (266);
++insert into t1 values (265);
++insert into t1 values (264);
++insert into t1 values (263);
++insert into t1 values (262);
++insert into t1 values (261);
++insert into t1 values (260);
++insert into t1 values (259);
++insert into t1 values (258);
++insert into t1 values (257);
++insert into t1 values (256);
++insert into t1 values (255);
++insert into t1 values (254);
++insert into t1 values (253);
++insert into t1 values (252);
++insert into t1 values (251);
++insert into t1 values (250);
++insert into t1 values (249);
++insert into t1 values (248);
++insert into t1 values (247);
++insert into t1 values (246);
++insert into t1 values (245);
++insert into t1 values (244);
++insert into t1 values (243);
++insert into t1 values (242);
++insert into t1 values (241);
++insert into t1 values (240);
++insert into t1 values (239);
++insert into t1 values (238);
++insert into t1 values (237);
++insert into t1 values (236);
++insert into t1 values (235);
++insert into t1 values (234);
++insert into t1 values (233);
++insert into t1 values (232);
++insert into t1 values (231);
++insert into t1 values (230);
++insert into t1 values (229);
++insert into t1 values (228);
++insert into t1 values (227);
++insert into t1 values (226);
++insert into t1 values (225);
++insert into t1 values (224);
++insert into t1 values (223);
++insert into t1 values (222);
++insert into t1 values (221);
++insert into t1 values (220);
++insert into t1 values (219);
++insert into t1 values (218);
++insert into t1 values (217);
++insert into t1 values (216);
++insert into t1 values (215);
++insert into t1 values (214);
++insert into t1 values (213);
++insert into t1 values (212);
++insert into t1 values (211);
++insert into t1 values (210);
++insert into t1 values (209);
++insert into t1 values (208);
++insert into t1 values (207);
++insert into t1 values (206);
++insert into t1 values (205);
++insert into t1 values (204);
++insert into t1 values (203);
++insert into t1 values (202);
++insert into t1 values (201);
++insert into t1 values (200);
++insert into t1 values (199);
++insert into t1 values (198);
++insert into t1 values (197);
++insert into t1 values (196);
++insert into t1 values (195);
++insert into t1 values (194);
++insert into t1 values (193);
++insert into t1 values (192);
++insert into t1 values (191);
++insert into t1 values (190);
++insert into t1 values (189);
++insert into t1 values (188);
++insert into t1 values (187);
++insert into t1 values (186);
++insert into t1 values (185);
++insert into t1 values (184);
++insert into t1 values (183);
++insert into t1 values (182);
++insert into t1 values (181);
++insert into t1 values (180);
++insert into t1 values (179);
++insert into t1 values (178);
++insert into t1 values (177);
++insert into t1 values (176);
++insert into t1 values (175);
++insert into t1 values (174);
++insert into t1 values (173);
++insert into t1 values (172);
++insert into t1 values (171);
++insert into t1 values (170);
++insert into t1 values (169);
++insert into t1 values (168);
++insert into t1 values (167);
++insert into t1 values (166);
++insert into t1 values (165);
++insert into t1 values (164);
++insert into t1 values (163);
++insert into t1 values (162);
++insert into t1 values (161);
++insert into t1 values (160);
++insert into t1 values (159);
++insert into t1 values (158);
++insert into t1 values (157);
++insert into t1 values (156);
++insert into t1 values (155);
++insert into t1 values (154);
++insert into t1 values (153);
++insert into t1 values (152);
++insert into t1 values (151);
++insert into t1 values (150);
++insert into t1 values (149);
++insert into t1 values (148);
++insert into t1 values (147);
++insert into t1 values (146);
++insert into t1 values (145);
++insert into t1 values (144);
++insert into t1 values (143);
++insert into t1 values (142);
++insert into t1 values (141);
++insert into t1 values (140);
++insert into t1 values (139);
++insert into t1 values (138);
++insert into t1 values (137);
++insert into t1 values (136);
++insert into t1 values (135);
++insert into t1 values (134);
++insert into t1 values (133);
++insert into t1 values (132);
++insert into t1 values (131);
++insert into t1 values (130);
++insert into t1 values (129);
++insert into t1 values (128);
++insert into t1 values (127);
++insert into t1 values (126);
++insert into t1 values (125);
++insert into t1 values (124);
++insert into t1 values (123);
++insert into t1 values (122);
++insert into t1 values (121);
++insert into t1 values (120);
++insert into t1 values (119);
++insert into t1 values (118);
++insert into t1 values (117);
++insert into t1 values (116);
++insert into t1 values (115);
++insert into t1 values (114);
++insert into t1 values (113);
++insert into t1 values (112);
++insert into t1 values (111);
++insert into t1 values (110);
++insert into t1 values (109);
++insert into t1 values (108);
++insert into t1 values (107);
++insert into t1 values (106);
++insert into t1 values (105);
++insert into t1 values (104);
++insert into t1 values (103);
++insert into t1 values (102);
++insert into t1 values (101);
++insert into t1 values (100);
++insert into t1 values (99);
++insert into t1 values (98);
++insert into t1 values (97);
++insert into t1 values (96);
++insert into t1 values (95);
++insert into t1 values (94);
++insert into t1 values (93);
++insert into t1 values (92);
++insert into t1 values (91);
++insert into t1 values (90);
++insert into t1 values (89);
++insert into t1 values (88);
++insert into t1 values (87);
++insert into t1 values (86);
++insert into t1 values (85);
++insert into t1 values (84);
++insert into t1 values (83);
++insert into t1 values (82);
++insert into t1 values (81);
++insert into t1 values (80);
++insert into t1 values (79);
++insert into t1 values (78);
++insert into t1 values (77);
++insert into t1 values (76);
++insert into t1 values (75);
++insert into t1 values (74);
++insert into t1 values (73);
++insert into t1 values (72);
++insert into t1 values (71);
++insert into t1 values (70);
++insert into t1 values (69);
++insert into t1 values (68);
++insert into t1 values (67);
++insert into t1 values (66);
++insert into t1 values (65);
++insert into t1 values (64);
++insert into t1 values (63);
++insert into t1 values (62);
++insert into t1 values (61);
++insert into t1 values (60);
++insert into t1 values (59);
++insert into t1 values (58);
++insert into t1 values (57);
++insert into t1 values (56);
++insert into t1 values (55);
++insert into t1 values (54);
++insert into t1 values (53);
++insert into t1 values (52);
++insert into t1 values (51);
++insert into t1 values (50);
++insert into t1 values (49);
++insert into t1 values (48);
++insert into t1 values (47);
++insert into t1 values (46);
++insert into t1 values (45);
++insert into t1 values (44);
++insert into t1 values (43);
++insert into t1 values (42);
++insert into t1 values (41);
++insert into t1 values (40);
++insert into t1 values (39);
++insert into t1 values (38);
++insert into t1 values (37);
++insert into t1 values (36);
++insert into t1 values (35);
++insert into t1 values (34);
++insert into t1 values (33);
++insert into t1 values (32);
++insert into t1 values (31);
++insert into t1 values (30);
++insert into t1 values (29);
++insert into t1 values (28);
++insert into t1 values (27);
++insert into t1 values (26);
++insert into t1 values (25);
++insert into t1 values (24);
++insert into t1 values (23);
++insert into t1 values (22);
++insert into t1 values (21);
++insert into t1 values (20);
++insert into t1 values (19);
++insert into t1 values (18);
++insert into t1 values (17);
++insert into t1 values (16);
++insert into t1 values (15);
++insert into t1 values (14);
++insert into t1 values (13);
++insert into t1 values (12);
++insert into t1 values (11);
++insert into t1 values (10);
++insert into t1 values (9);
++insert into t1 values (8);
++insert into t1 values (7);
++insert into t1 values (6);
++insert into t1 values (5);
++insert into t1 values (4);
++insert into t1 values (3);
++insert into t1 values (2);
++insert into t1 values (1);
++"The following are SLAVE."
++select count(distinct n) from t1;
++count(distinct n)
++300
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++300
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9306 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No #
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000014 2849
++"The following are SLAVE1."
++start slave;
++select count(distinct n) from t1;
++count(distinct n)
++300
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++300
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No #
++"The following are SLAVE."
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication-log';
++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
++stop slave;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication_log';
++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
++MAKE MASTER REVOKE SESSION WITH KILL;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication_log'
++ WITH BINLOG;
++MAKE MASTER GRANT SESSION;
++delete from t1 where n > 250;
++select count(distinct n) from t1;
++count(distinct n)
++250
++"The following are SLAVE1."
++select count(distinct n) from t1;
++count(distinct n)
++250
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++250
++"The following are SLAVE2."
++start slave;
++select count(distinct n) from t1;
++count(distinct n)
++250
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++250
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000015 189 # # master-bin.000015 Yes Yes # 0 0 189 # None 0 No #
++drop table t1;
++drop table t1;
++"The following are SLAVE."
++show master logs;
++Log_name File_size
++master-bin.000001 4214
++master-bin.000002 4212
++master-bin.000003 4212
++master-bin.000004 4212
++master-bin.000005 4212
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000015 265
++"The following are SLAVE2."
++show master logs;
++Log_name File_size
++master-bin.000001 4214
++master-bin.000002 4212
++master-bin.000003 4212
++master-bin.000004 4212
++master-bin.000005 4212
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000015 265
++purge master logs to 'master-bin.000006';
++show master logs;
++Log_name File_size
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++reset master;
++ERROR HY000: Binlog closed, cannot RESET MASTER
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-master.opt
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog-master.opt Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++-O max_binlog_size=4096
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-slave.opt
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog-slave.opt Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--rpl_mirror_binlog_enabled=1 --log-bin-index=replication_log
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.1.slave-mi
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.1.slave-mi Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=3
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.2.slave-mi
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.2.slave-mi Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=4
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.test
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.test Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,119 @@
++-- source include/master-slave.inc
++-- source include/have_innodb.inc
++connect (slave_sec,localhost,root,,test,$SLAVE_MYPORT1,$SLAVE_MYSOCK1);
++connect (slave_ter,localhost,root,,test,$SLAVE_MYPORT2,$SLAVE_MYSOCK2);
++
++connection master;
++--disable_warnings
++drop table if exists t1;
++--enable_warnings
++create table t1(n int) engine = InnoDB;
++
++let $i=300;
++while ($i)
++{
++ eval insert into t1 values ($i);
++ dec $i;
++}
++
++save_master_pos;
++
++connection slave;
++sync_with_master;
++
++echo "The following are SLAVE.";
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++show master status;
++
++connection slave_sec;
++echo "The following are SLAVE1.";
++start slave;
++sync_with_master;
++
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++
++# make the slave the new master
++connection slave;
++echo "The following are SLAVE.";
++
++# The first 1201 error is caused by the slave still running.
++--error 1201
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication-log';
++stop slave;
++
++# The second 1201 error is caused by failover mode.
++--error 1201
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication_log';
++
++MAKE MASTER REVOKE SESSION WITH KILL;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication_log'
++ WITH BINLOG;
++
++MAKE MASTER GRANT SESSION;
++
++delete from t1 where n > 250;
++save_master_pos;
++
++select count(distinct n) from t1;
++
++connection slave_sec;
++echo "The following are SLAVE1.";
++
++sync_with_master;
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++
++connection slave_ter;
++echo "The following are SLAVE2.";
++start slave;
++sync_with_master;
++
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++
++connection master;
++drop table t1;
++
++connection slave;
++drop table t1;
++save_master_pos;
++
++connection slave_sec;
++sync_with_master;
++
++connection slave;
++echo "The following are SLAVE.";
++
++show master logs;
++show master status;
++
++
++connection slave_ter;
++echo "The following are SLAVE2.";
++sync_with_master;
++
++show master logs;
++show master status;
++purge master logs to 'master-bin.000006';
++show master logs;
++--error 1186
++reset master;
+diff -r 66cc9e0a6768 patch_info/mirror_binlog.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/mirror_binlog.info Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,6 @@
++File=mirror_binlog.patch
++Name=Mirroring binary logs on slave
++Version=V1
++Author=Google
++License=GPL
++Comment=contains FastMaster promotion patch
+diff -r 66cc9e0a6768 sql/Makefile.am
+--- a/sql/Makefile.am Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/Makefile.am Thu Dec 04 21:46:15 2008 -0800
+@@ -68,7 +68,7 @@
+ sql_array.h sql_cursor.h \
+ examples/ha_example.h ha_archive.h \
+ examples/ha_tina.h ha_blackhole.h \
+- ha_federated.h
++ ha_federated.h repl_mule.h
+ mysqld_SOURCES = sql_lex.cc sql_handler.cc \
+ item.cc item_sum.cc item_buff.cc item_func.cc \
+ item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \
+@@ -105,7 +105,7 @@
+ sp_cache.cc parse_file.cc sql_trigger.cc \
+ examples/ha_example.cc ha_archive.cc \
+ examples/ha_tina.cc ha_blackhole.cc \
+- ha_federated.cc
++ ha_federated.cc repl_mule.cc
+
+ gen_lex_hash_SOURCES = gen_lex_hash.cc
+ gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS)
+diff -r 66cc9e0a6768 sql/Makefile.in
+--- a/sql/Makefile.in Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/Makefile.in Thu Dec 04 21:46:15 2008 -0800
+@@ -152,7 +152,7 @@
+ sp_rcontext.$(OBJEXT) sp.$(OBJEXT) sp_cache.$(OBJEXT) \
+ parse_file.$(OBJEXT) sql_trigger.$(OBJEXT) \
+ ha_example.$(OBJEXT) ha_archive.$(OBJEXT) ha_tina.$(OBJEXT) \
+- ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT)
++ ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) repl_mule.$(OBJEXT)
+ mysqld_OBJECTS = $(am_mysqld_OBJECTS)
+ mysqld_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) \
+ $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) \
+@@ -516,7 +516,7 @@
+ sql_array.h sql_cursor.h \
+ examples/ha_example.h ha_archive.h \
+ examples/ha_tina.h ha_blackhole.h \
+- ha_federated.h
++ ha_federated.h repl_mule.h
+
+ mysqld_SOURCES = sql_lex.cc sql_handler.cc \
+ item.cc item_sum.cc item_buff.cc item_func.cc \
+@@ -554,7 +554,7 @@
+ sp_cache.cc parse_file.cc sql_trigger.cc \
+ examples/ha_example.cc ha_archive.cc \
+ examples/ha_tina.cc ha_blackhole.cc \
+- ha_federated.cc
++ ha_federated.cc repl_mule.cc
+
+ gen_lex_hash_SOURCES = gen_lex_hash.cc
+ gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS)
+@@ -748,6 +748,7 @@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protocol.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/records.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_failsafe.Po@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_mule.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/set_var.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/slave.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sp.Po@am__quote@
+diff -r 66cc9e0a6768 sql/lex.h
+--- a/sql/lex.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/lex.h Thu Dec 04 21:46:15 2008 -0800
+@@ -292,6 +292,7 @@
+ { "LONGTEXT", SYM(LONGTEXT)},
+ { "LOOP", SYM(LOOP_SYM)},
+ { "LOW_PRIORITY", SYM(LOW_PRIORITY)},
++ { "MAKE", SYM(MAKE_SYM)},
+ { "MASTER", SYM(MASTER_SYM)},
+ { "MASTER_CONNECT_RETRY", SYM(MASTER_CONNECT_RETRY_SYM)},
+ { "MASTER_HOST", SYM(MASTER_HOST_SYM)},
+diff -r 66cc9e0a6768 sql/log.cc
+--- a/sql/log.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/log.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -79,7 +79,9 @@
+
+ bool binlog_init()
+ {
+- return !opt_bin_log;
++ if (!opt_bin_log)
++ binlog_hton.prepare = NULL;
++ return 0; /* return !opt_bin_log; */
+ }
+
+ static int binlog_close_connection(THD *thd)
+@@ -406,6 +408,7 @@
+ :bytes_written(0), last_time(0), query_start(0), name(0),
+ prepared_xids(0), log_type(LOG_CLOSED), file_id(1), open_count(1),
+ write_error(FALSE), inited(FALSE), need_start_event(TRUE),
++ mule_binlog_(0),
+ description_event_for_exec(0), description_event_for_queue(0)
+ {
+ /*
+@@ -506,7 +509,10 @@
+ const char *log_name)
+ {
+ File index_file_nr= -1;
+- DBUG_ASSERT(!my_b_inited(&index_file));
++
++ /* If the index is already opened, do not open it again. */
++ if (my_b_inited(&index_file))
++ return FALSE;
+
+ /*
+ First open of this class instance
+@@ -750,7 +756,7 @@
+ if (file >= 0)
+ my_close(file,MYF(0));
+ end_io_cache(&log_file);
+- end_io_cache(&index_file);
++ close_index_file();
+ safeFree(name);
+ log_type= LOG_CLOSED;
+ DBUG_RETURN(1);
+@@ -768,7 +774,10 @@
+ int MYSQL_LOG::raw_get_current_log(LOG_INFO* linfo)
+ {
+ strmake(linfo->log_file_name, log_file_name, sizeof(linfo->log_file_name)-1);
+- linfo->pos = my_b_tell(&log_file);
++ if (!mule_binlog_)
++ linfo->pos = my_b_tell(&log_file);
++ else
++ linfo->pos = my_b_filelength(&log_file);
+ return 0;
+ }
+
+@@ -935,6 +944,11 @@
+ if (need_lock)
+ pthread_mutex_lock(&LOCK_index);
+ safe_mutex_assert_owner(&LOCK_index);
++
++ if (open_index_file(index_file_name, NULL) != 0) {
++ error = -1;
++ goto err;
++ }
+
+ /* As the file is flushed, we can't get an error here */
+ (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0,
+@@ -1446,18 +1460,19 @@
+ SYNOPSIS
+ new_file()
+ need_lock Set to 1 if caller has not locked LOCK_log
++ log_filename Name to use for the new log file; NULL generates the next name
+
+ NOTE
+ The new file name is stored last in the index file
+ */
+
+-void MYSQL_LOG::new_file(bool need_lock)
++void MYSQL_LOG::new_file(bool need_lock, const char* log_filename)
+ {
+ char new_name[FN_REFLEN], *new_name_ptr, *old_name;
+ enum_log_type save_log_type;
+
+ DBUG_ENTER("MYSQL_LOG::new_file");
+- if (!is_open())
++ if (!is_log_open())
+ {
+ DBUG_PRINT("info",("log is closed"));
+ DBUG_VOID_RETURN;
+@@ -1496,7 +1511,9 @@
+ We have to do this here and not in open as we want to store the
+ new file name in the current binary log file.
+ */
+- if (generate_new_name(new_name, name))
++ if (log_filename) {
++ fn_format(new_name,log_filename,mysql_data_home,"",4);
++ } else if (generate_new_name(new_name, name))
+ goto end;
+ new_name_ptr=new_name;
+
+@@ -1571,7 +1588,7 @@
+ bytes_written+= ev->data_written;
+ DBUG_PRINT("info",("max_size: %lu",max_size));
+ if ((uint) my_b_append_tell(&log_file) > max_size)
+- new_file(0);
++ new_file(0);
+
+ err:
+ pthread_mutex_unlock(&LOCK_log);
+@@ -1600,8 +1617,14 @@
+ bytes_written += len;
+ } while ((buf=va_arg(args,const char*)) && (len=va_arg(args,uint)));
+ DBUG_PRINT("info",("max_size: %lu",max_size));
+- if ((uint) my_b_append_tell(&log_file) > max_size)
+- new_file(0);
++
++ /* If max_size is BINLOG_NOSWITCH_SIZE, binlog would not switch because
++ * of file size limit.
++ */
++ if (max_size != BINLOG_NOSWITCH_SIZE &&
++ (uint) my_b_append_tell(&log_file) > max_size) {
++ new_file(0);
++ }
+
+ err:
+ if (!error)
+@@ -2492,6 +2515,17 @@
+ DBUG_VOID_RETURN;
+ }
+
++int MYSQL_LOG::flush_log_file() { /* Flush the log_file IO_CACHE and return flush_io_cache()'s result. */
++ return flush_io_cache(&log_file);
++}
++
++int MYSQL_LOG::close_index_file() { /* Close the index file if its IO_CACHE is initialised; always returns 0. */
++ if (my_b_inited(&index_file)) { /* nothing to do when the index cache was never set up */
++ end_io_cache(&index_file);
++ my_close(index_file.file, MYF(0)); /* the result of my_close() is ignored */
++ }
++ return 0;
++}
+
+ /*
+ Check if a string is a valid number
+diff -r 66cc9e0a6768 sql/log_event.h
+--- a/sql/log_event.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/log_event.h Thu Dec 04 21:46:15 2008 -0800
+@@ -94,6 +94,14 @@
+ #define LINE_TERM_EMPTY 0x4
+ #define LINE_START_EMPTY 0x8
+ #define ESCAPED_EMPTY 0x10
++
++/* This server-id value is used to indicate a special master-info event
++ * in relay-log.
++ * We will enforce in database that replication can not set this value
++ * as the server-id.
++ */
++#define MASTER_INFO_SERVER_ID 0xffffffff
++
+
+ /*****************************************************************************
+
+diff -r 66cc9e0a6768 sql/mysql_priv.h
+--- a/sql/mysql_priv.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/mysql_priv.h Thu Dec 04 21:46:15 2008 -0800
+@@ -462,6 +462,7 @@
+ /* BINLOG_DUMP options */
+
+ #define BINLOG_DUMP_NON_BLOCK 1
++#define BINLOG_MIRROR_CLIENT 0x0004
+
+ /* sql_show.cc:show_log_files() */
+ #define SHOW_LOG_STATUS_FREE "FREE"
+@@ -1374,6 +1375,7 @@
+ extern const char **errmesg; /* Error messages */
+ extern const char *myisam_recover_options_str;
+ extern const char *in_left_expr_name, *in_additional_cond, *in_having_cond;
++extern char *opt_binlog_index_name;
+ extern const char * const triggers_file_ext;
+ extern const char * const trigname_file_ext;
+ extern Eq_creator eq_creator;
+@@ -1875,6 +1877,10 @@
+ extern "C" void unireg_abort(int exit_code);
+ void kill_delayed_threads(void);
+ bool check_stack_overrun(THD *thd, long margin, char *dummy);
++extern my_bool rpl_mirror_binlog_enabled;
++extern ulong sync_mirror_binlog_period;
++extern my_bool rpl_mirror_binlog_no_replicate;
++extern ulong rpl_mirror_binlog_clients, rpl_mirror_binlog_status;
+ #else
+ #define unireg_abort(exit_code) DBUG_RETURN(exit_code)
+ inline void kill_delayed_threads(void) {}
+diff -r 66cc9e0a6768 sql/mysqld.cc
+--- a/sql/mysqld.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/mysqld.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -555,6 +555,7 @@
+ pthread_mutex_t LOCK_global_user_client_stats;
+ pthread_mutex_t LOCK_global_table_stats;
+ pthread_mutex_t LOCK_global_index_stats;
++pthread_mutex_t LOCK_failover_master;
+ /*
+ The below lock protects access to two global server variables:
+ max_prepared_stmt_count and prepared_stmt_count. These variables
+@@ -584,13 +585,15 @@
+ char *master_ssl_key, *master_ssl_cert;
+ char *master_ssl_ca, *master_ssl_capath, *master_ssl_cipher;
+
++char *opt_binlog_index_name;
++
+ /* Static variables */
+
+ static bool kill_in_progress, segfaulted;
+ static my_bool opt_do_pstack, opt_bootstrap, opt_myisam_log;
+ static int cleanup_done;
+ static ulong opt_specialflag, opt_myisam_block_size;
+-static char *opt_logname, *opt_update_logname, *opt_binlog_index_name;
++static char *opt_logname, *opt_update_logname;
+ static char *opt_tc_heuristic_recover;
+ static char *mysql_home_ptr, *pidfile_name_ptr;
+ static char **defaults_argv;
+@@ -598,6 +601,32 @@
+
+ static my_socket unix_sock,ip_sock;
+ struct rand_struct sql_rand; // used by sql_class.cc:THD::THD()
++
++/* When set, we are inside a failover slave and deny all non-super access */
++bool failover_deny_access= 0;
++
++/* When set, binlog will be mirrored on the replica. */
++my_bool rpl_mirror_binlog_enabled;
++
++/* Sync the mirrored binlog to disk after every #th event. */
++ulong sync_mirror_binlog_period;
++
++/* The fixed size for replication event buffer. Replication event can exceed
++ * the size.
++ */
++//ulong rpl_event_buffer_size;
++
++/* This is a mirror binlog status variable on the primary to indicate how many
++ * mirror binlog servers are connecting.
++ */
++ulong rpl_mirror_binlog_clients = 0;
++
++/* This indicates whether mirror binlog is working on a replica database. It
++ * requires:
++ * . rpl_mirror_binlog_enabled = 1
++ * . the slave I/O thread is running and mirror binlog is also dumped
++ */
++ulong rpl_mirror_binlog_status = 0;
+
+ /* OS specific variables */
+
+@@ -1315,6 +1344,7 @@
+ (void) pthread_cond_destroy(&COND_flush_thread_cache);
+ (void) pthread_cond_destroy(&COND_manager);
+ (void) pthread_mutex_destroy(&LOCK_stats);
++ (void) pthread_mutex_destroy(&LOCK_failover_master);
+ (void) pthread_mutex_destroy(&LOCK_global_user_client_stats);
+ (void) pthread_mutex_destroy(&LOCK_global_table_stats);
+ (void) pthread_mutex_destroy(&LOCK_global_index_stats);
+@@ -3164,6 +3194,7 @@
+ (void) pthread_cond_init(&COND_rpl_status, NULL);
+ #endif
+ (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST);
++ (void) pthread_mutex_init(&LOCK_failover_master, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST);
+@@ -3398,39 +3429,8 @@
+
+ if (opt_bin_log)
+ {
+- char buf[FN_REFLEN];
+- const char *ln;
+- ln= mysql_bin_log.generate_name(opt_bin_logname, "-bin", 1, buf);
+- if (!opt_bin_logname && !opt_binlog_index_name)
+- {
+- /*
+- User didn't give us info to name the binlog index file.
+- Picking `hostname`-bin.index like did in 4.x, causes replication to
+- fail if the hostname is changed later. So, we would like to instead
+- require a name. But as we don't want to break many existing setups, we
+- only give warning, not error.
+- */
+- sql_print_warning("No argument was provided to --log-bin, and "
+- "--log-bin-index was not used; so replication "
+- "may break when this MySQL server acts as a "
+- "master and has his hostname changed!! Please "
+- "use '--log-bin=%s' to avoid this problem.", ln);
+- }
+- if (ln == buf)
+- {
+- my_free(opt_bin_logname, MYF(MY_ALLOW_ZERO_PTR));
+- opt_bin_logname=my_strdup(buf, MYF(0));
+- }
+- if (mysql_bin_log.open_index_file(opt_binlog_index_name, ln))
+- {
+- unireg_abort(1);
+- }
+-
+- /*
+- Used to specify which type of lock we need to use for queries of type
+- INSERT ... SELECT. This will change when we have row level logging.
+- */
+- using_update_log=1;
++ if (make_master_open_index(&opt_bin_logname, opt_binlog_index_name) != 0)
++ unireg_abort(1);
+ }
+
+ if (xid_cache_init())
+@@ -3480,9 +3480,10 @@
+ unireg_abort(1);
+ }
+
+- if (opt_bin_log && mysql_bin_log.open(opt_bin_logname, LOG_BIN, 0,
+- WRITE_CACHE, 0, max_binlog_size, 0))
+- unireg_abort(1);
++ if (opt_bin_log &&
++ make_master(NULL, opt_bin_logname, opt_binlog_index_name, NULL) != 0) {
++ unireg_abort(1);
++ }
+
+ #ifdef HAVE_REPLICATION
+ if (opt_bin_log && expire_logs_days)
+@@ -5098,6 +5098,8 @@
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
++ OPT_RPL_MIRROR_BINLOG,
++ OPT_SYNC_MIRROR_BINLOG,
+ OPT_FEDERATED,
+ OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+ };
+@@ -5725,6 +5728,11 @@
+ {"rpl-recovery-rank", OPT_RPL_RECOVERY_RANK, "Undocumented.",
+ (gptr*) &rpl_recovery_rank, (gptr*) &rpl_recovery_rank, 0, GET_ULONG,
+ REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++ {"rpl_mirror_binlog_enabled", OPT_RPL_MIRROR_BINLOG,
++ "1 = support mirroring binlogs. 0 = disable mirroring binlogs",
++ (gptr*) &rpl_mirror_binlog_enabled,
++ (gptr*) &rpl_mirror_binlog_enabled, 0, GET_BOOL, NO_ARG,
++ 0, 0, 1, 0, 1, 0},
+ {"safe-mode", OPT_SAFE, "Skip some optimize stages (for testing).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ #ifndef TO_BE_DELETED
+@@ -5849,6 +5857,11 @@
+ {"symbolic-links", 's', "Enable symbolic link support.",
+ (gptr*) &my_use_symdir, (gptr*) &my_use_symdir, 0, GET_BOOL, NO_ARG,
+ IF_PURIFY(0,1), 0, 0, 0, 0, 0},
++ {"sync-mirror-binlog", OPT_SYNC_MIRROR_BINLOG,
++ "Sync the mirrored binlog to disk after every #th event. "
++ "#=0 (the default) does no sync. Syncing slows MySQL down",
++ (gptr*) &sync_mirror_binlog_period,
++ (gptr*) &sync_mirror_binlog_period, 0, GET_ULONG, REQUIRED_ARG, 0, 0, ~0L, 0, 1, 0},
+ {"sysdate-is-now", OPT_SYSDATE_IS_NOW,
+ "Non-default option to alias SYSDATE() to NOW() to make it safe-replicable. Since 5.0, SYSDATE() returns a `dynamic' value different for different invocations, even within the same statement.",
+ (gptr*) &global_system_variables.sysdate_is_now,
+@@ -6625,6 +6638,7 @@
+ {"Delayed_errors", (char*) &delayed_insert_errors, SHOW_LONG},
+ {"Delayed_insert_threads", (char*) &delayed_insert_threads, SHOW_LONG_CONST},
+ {"Delayed_writes", (char*) &delayed_insert_writes, SHOW_LONG},
++ {"Failover_deny_access", (char*) &failover_deny_access, SHOW_BOOL}, /* the variable is a C++ bool, not a long */
+ {"Flush_commands", (char*) &refresh_version, SHOW_LONG_CONST},
+ {"Handler_commit", (char*) offsetof(STATUS_VAR, ha_commit_count), SHOW_LONG_STATUS},
+ {"Handler_delete", (char*) offsetof(STATUS_VAR, ha_delete_count), SHOW_LONG_STATUS},
+diff -r 66cc9e0a6768 sql/repl_mule.cc
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/sql/repl_mule.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,466 @@
++/*
++ Copyright (C) 2007 Google Inc.
++
++This program is free software; you can redistribute it and/or
++modify it under the terms of the GNU General Public License
++as published by the Free Software Foundation; either version 2
++of the License, or (at your option) any later version.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
++*/
++
++#include "mysql_priv.h"
++#include <my_dir.h>
++#include "slave.h"
++#include "repl_mule.h"
++
++/* max log size: 2GB */
++#define MAX_LOG_SIZE BINLOG_NOSWITCH_SIZE
++
++ReplMule::ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status,
++ my_off_t file_size, const char *binlog_indexname,
++ MYSQL_LOG *binlog, ulong sync_period)
++ : desc_event_(new Format_description_log_event(BINLOG_VERSION)),
++ io_thd_(thd), mi_(mi), status_(status), dump_position_(0L),
++ file_size_(file_size), mule_log_(binlog),
++ mule_log_sync_period_(sync_period), mule_log_event_counter_(0) {
++ char llbuf1[22], llbuf2[22];
++ /* Put the binlog into mule mode, open the mule log if needed, then set the positions required by status. */
++ DBUG_ENTER("ReplMule::ReplMule");
++
++ /* Indicate that we are in replication mule mode. */
++ mule_log_->set_mule_mode();
++
++ strmake(curr_log_filename_, mi->master_log_name,
++ sizeof(curr_log_filename_)-1);
++ strmake(mule_indexname_, binlog_indexname, sizeof(mule_indexname_)-1);
++
++ /* Open the mule log file unless it is already open. */
++ if (!mule_log_->is_log_open()) {
++ /* Do not open binlog file when master_log_name is not specified. We
++ * are at the I/O thread initialization time and we do not know what
++ * filename we are going to dump.
++ * We wait for the next rotation event to indicate the filename.
++ */
++ if (strlen(curr_log_filename_) > 0 &&
++ mule_log_->open(curr_log_filename_, LOG_BIN, NULL,
++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) {
++ sql_print_error("ReplMule: open binlog failed: %s",
++ curr_log_filename_);
++ status_ = MULE_ERROR; /* failure is reported only through status_ */
++ DBUG_VOID_RETURN;
++ }
++ }
++
++ switch (status_) {
++ case MULE_BEHIND:
++ dump_position_ = mi->master_log_pos; /* remember the position the caller asked for */
++ mi->master_log_pos = file_size_; /* continue from the current mule log size */
++ sql_print_information("ReplicationMule: MULE_BEHIND - new(%s), old(%s)",
++ llstr(mi->master_log_pos, llbuf1),
++ llstr(dump_position_, llbuf2));
++ break;
++ case RELAY_MATCH_MULE:
++ case RELAY_MATCH_MULE_RUN:
++ dump_position_ = mi->master_log_pos; /* master_log_pos itself is left unchanged */
++ sql_print_information("ReplicationMule: RELAY_MATCH_MULE.");
++ break;
++ case MULE_VERIFY:
++ case MULE_VERIFY_RELAY_BEHIND:
++ dump_position_ = mi->master_log_pos; /* remember the position the caller asked for */
++ mi->master_log_pos = BIN_LOG_HEADER_SIZE; /* restart right after the binlog header */
++ sql_print_information(
++ "ReplicationMule: MULE_VERIFY - old(%s), file_size(%s)",
++ llstr(dump_position_, llbuf1), llstr(file_size_, llbuf2));
++
++ /* seek to the beginning of the file for verification */
++ seekToPosition(BIN_LOG_HEADER_SIZE);
++ break;
++ }
++
++ DBUG_VOID_RETURN;
++}
++
++ReplMule::~ReplMule() {
++ DBUG_ENTER("ReplMule::~ReplMule");
++ /* Close the mule log (and its index) if open, leave mule mode, and restore master_log_pos when needed. */
++ if (mule_log_->is_log_open())
++ mule_log_->close(LOG_CLOSE_INDEX);
++ mule_log_->clear_mule_mode();
++
++ /* If we are still in MULE_BEHIND or MULE_VERIFY state and we exit from
++ * I/O thread, it means we encountered some errors.
++ * mi->master_log_pos might be used by later slave start. It is being
++ * changed here to do event dumping or event verification. So, we should
++ * restore it to its original value.
++ */
++ switch (status_) {
++ case MULE_BEHIND:
++ case MULE_VERIFY:
++ if (mi_->master_log_pos < dump_position_) /* only ever move the position forward */
++ mi_->master_log_pos = dump_position_;
++ break;
++ }
++
++ delete desc_event_; /* allocated in the constructor's initialiser list */
++
++ DBUG_VOID_RETURN;
++}
++
++ReplMule::WriteStatus ReplMule::writeEvent(const char* buf, ulong event_len) {
++ WriteStatus dump_status = WRITE_RELAY;
++ char llbuf1[22], llbuf2[22], llbuf3[22];
++ char *verify_event;
++ bool verified = false;
++ bool skip_event = false;
++ /* Verify and/or queue one master event according to status_; returns WRITE_RELAY, SKIP_RELAY or WRITE_ERROR. */
++ DBUG_ENTER("ReplMule::dumpEvent");
++ switch (status_) {
++ case MULE_VERIFY:
++ case MULE_VERIFY_RELAY_BEHIND:
++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT &&
++ IsFakeRotation(buf, event_len)) {
++ /* Do not verify the faked rotate event */
++ if (status_ == MULE_VERIFY)
++ dump_status = SKIP_RELAY;
++ break;
++ }
++ verify_event = new char[event_len];
++ if (verify_event == NULL) {
++ sql_print_error(
++ "ReplMule::dumpEvent - insufficient memory in verification, "
++ "position(%s), event_len(%lu).",
++ llstr(mi_->master_log_pos, llbuf1), event_len);
++ dump_status = WRITE_ERROR;
++ break;
++ }
++ if (my_b_read(mule_log_->get_log_file(), (byte*) verify_event,
++ event_len) != 0) {
++ sql_print_error(
++ "ReplMule::dumpEvent - read log error in verification, "
++ "position(%s), event_len(%lu).",
++ llstr(mi_->master_log_pos, llbuf1), event_len);
++ dump_status = WRITE_ERROR;
++ delete[] verify_event; /* array form: the buffer came from new char[] */
++ break;
++ }
++ verified = (memcmp(buf, verify_event, event_len) == 0);
++ delete[] verify_event; /* array form: the buffer came from new char[] */
++ if (!verified) {
++ sql_print_error(
++ "ReplMule::dumpEvent - event does not match at position(%s)",
++ llstr(mi_->master_log_pos, llbuf1));
++ dump_status = WRITE_ERROR;
++ break;
++ }
++ /* fall through */
++ case MULE_BEHIND:
++ dump_status = SKIP_RELAY;
++ if (status_ == MULE_BEHIND &&
++ queueEvent(buf, event_len, &skip_event) != 0) {
++ dump_status = WRITE_ERROR;
++ break;
++ }
++
++ /* Skip faked rotation event */
++ if (!skip_event)
++ mi_->master_log_pos += event_len;
++
++ if (mi_->master_log_pos == dump_position_) {
++ if (dump_position_ < file_size_) {
++ status_ = MULE_VERIFY_RELAY_BEHIND;
++ } else {
++ status_ = RELAY_MATCH_MULE;
++ }
++ sql_print_information(
++ "ReplMule::dumpEvent - new status(%d) "
++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_,
++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2),
++ llstr(file_size_, llbuf3));
++ } else if (mi_->master_log_pos == file_size_) {
++ if (dump_position_ > file_size_) {
++ status_ = MULE_BEHIND;
++ } else {
++ status_ = RELAY_MATCH_MULE;
++ }
++ sql_print_information(
++ "ReplMule::dumpEvent - new status(%d) "
++ "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_,
++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2),
++ llstr(file_size_, llbuf3));
++ } else if (status_ != MULE_VERIFY_RELAY_BEHIND &&
++ mi_->master_log_pos > dump_position_) {
++ sql_print_error(
++ "ReplMule::dumpEvent - mule position(%s) does not match "
++ "relay-log position(%s).",
++ llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2));
++ dump_status = WRITE_ERROR;
++ }
++ break;
++ case RELAY_MATCH_MULE_RUN:
++ if (buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) {
++ sql_print_information(" RELAY_MATCH_MULE event %d", buf[EVENT_TYPE_OFFSET] );
++ /* Do not write format description record if size is the same */
++ break;
++ } /* fall through: any other event is queued exactly as in RELAY_MATCH_MULE */
++ case RELAY_MATCH_MULE:
++ if (queueEvent(buf, event_len, &skip_event) != 0)
++ dump_status = WRITE_ERROR;
++ break;
++ }
++
++ DBUG_RETURN(dump_status);
++}
++
++int ReplMule::appendEvent(const char* buf, ulong event_len) {
++ char llbuf1[22];
++ int error;
++ /* Append buf to the mule log, flush it, and sync every mule_log_sync_period_ events; returns 0 on success. */
++ DBUG_ENTER("ReplMule::appendEvent");
++
++ error = mule_log_->appendv(buf,event_len,0);
++ if (error != 0) {
++ sql_print_error("ReplMule::appendEvent - append error at %s(%s)",
++ mi_->master_log_name,
++ llstr(mi_->master_log_pos, llbuf1));
++ } else if (mule_log_->flush_log_file() != 0) {
++ sql_print_error("ReplMule::appendEvent - flush error at %s(%s)",
++ mi_->master_log_name,
++ llstr(mi_->master_log_pos, llbuf1));
++ error = -1;
++ } else if (mule_log_sync_period_ > 0) { /* a period of 0 disables syncing */
++ mule_log_event_counter_++;
++ if (mule_log_event_counter_ >= mule_log_sync_period_) {
++ mule_log_event_counter_ = 0; /* start counting towards the next sync */
++ error = my_sync(mule_log_->get_log_file()->file, MYF(MY_WME));
++ if (error != 0)
++ sql_print_error("ReplMule::appendEvent - sync error at %s(%s)",
++ mi_->master_log_name,
++ llstr(mi_->master_log_pos, llbuf1));
++ }
++ }
++
++ DBUG_RETURN(error);
++}
++
++int ReplMule::queueEvent(const char* buf, ulong event_len, bool *skip_event) {
++ int error = 0;
++ /* Write one event to the mule log under the log lock; sets *skip_event for a fake rotate. Returns 0 on success. */
++ DBUG_ENTER("ReplMule::queueEvent");
++
++ *skip_event = false;
++
++ mule_log_->lock_log();
++ if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT) {
++ Rotate_log_event rev(buf, event_len, desc_event_);
++
++ /* If this is a faked rotate event and the specified filename is
++ * the same as the current binlog filename, ignore the event.
++ */
++ if (IsFakeRotation(rev)) {
++ *skip_event = true;
++ DBUG_PRINT("info",("skipped faked rotation event"));
++ } else {
++ /* Only append real events. */
++ if (rev.when != 0)
++ error = appendEvent(buf, event_len);
++
++ /* Only rotate file when append succeeds. */
++ if (error == 0) {
++ /* Create a new file: lock both index and log. */
++ if (strlen(curr_log_filename_) == 0) {
++ /* If curr_log_filename_ is not specified, then this is the first
++ * valid rotation event to indicate the filename.
++ */
++ error = mule_log_->open(rev.new_log_ident, LOG_BIN, NULL,
++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0);
++ } else {
++ mule_log_->new_file(0, rev.new_log_ident);
++ }
++ /* Bound the copy by the destination size, not by the length of the name sent by the master. */
++ strmake(curr_log_filename_, rev.new_log_ident,
++ sizeof(curr_log_filename_)-1);
++
++ DBUG_PRINT("info",("rotate file: %s", rev.new_log_ident));
++ }
++ }
++ } else {
++ error = appendEvent(buf, event_len);
++ }
++ mule_log_->unlock_log();
++
++ DBUG_RETURN(error);
++}
++
++void ReplMule::seekToPosition(my_off_t pos) {
++ DBUG_ENTER("ReplMule::seekToPosition");
++ DBUG_PRINT("enter",("seek_pos: %ld", (ulong) pos));
++ /* Reposition the mule log's IO_CACHE at offset pos. */
++ my_b_seek(mule_log_->get_log_file(), pos);
++ DBUG_VOID_RETURN;
++}
++
++bool ReplMule::IsFakeRotation(const char* buf, ulong event_len) {
++ DBUG_ENTER("ReplMule::IsFakeRotation");
++ /* Decode the raw buffer as a rotate event and defer to the Rotate_log_event overload. */
++ Rotate_log_event rev(buf, event_len, desc_event_);
++ DBUG_RETURN(IsFakeRotation(rev));
++}
++
++bool ReplMule::IsFakeRotation(const Rotate_log_event& rev) { /* True when rev has a zero timestamp and names the current log file. */
++ DBUG_ENTER("ReplMule::IsFakeRotation");
++ DBUG_RETURN(rev.when == 0 &&
++ rev.ident_len == strlen(curr_log_filename_) && /* length check before the string compare */
++ strcmp(rev.new_log_ident, curr_log_filename_) == 0);
++}
++
++/* createReplicationMule:
++ * Create a mule that relays the master's replication binlog and writes an
++ * exact copy locally. Returns the mule, or NULL if none could be created.
++ *
++ * Code flow:
++ * last_mulelog = scan the existing mule log index to find it
++ * if (mulelog index is not created or there is no mule log inside it)
++ * old_mule_log <- requested dumping position
++ * requested dumping position <- 0 in the file
++ * else
++ * check whether the mule log matches the requested dump
++ * (whether the last mule log name/size matches)
++ * if the mule log name does not match
++ * exit with an error
++ * if (the mule log size does not match the requested dump position)
++ * request the dump from position 0 and read all events
++ * verify all events with the corresponding events in mule log
++ * if (the verification succeeds)
++ * continue the dump
++ * else
++ * exit with an error
++ */
++ReplMule* ReplMule::createReplicationMule(
++ THD* thd, MASTER_INFO *mi, const char *binlog_indexname,
++ MYSQL_LOG *binlog) {
++ ReplMule *mule = NULL;
++ LOG_INFO linfo;
++ bool index_open_failed = false; /* open_index_file() result: true means failure */
++
++ DBUG_ENTER("ReplMule::createReplicationMule");
++
++ /* binlog_indexname must be set to some real value. */
++ DBUG_ASSERT(binlog_indexname);
++
++ /* Lock binlog index for all binlog operations */
++ binlog->lock_index();
++ index_open_failed = binlog->open_index_file(binlog_indexname, NULL);
++ DBUG_PRINT("info",("open index file succeed: %d", !index_open_failed));
++ sql_print_information("createReplicationMule");
++
++ /* Scan the existing binlog index to find the last relayed binlog */
++ if (index_open_failed ||
++ binlog->find_log_pos(&linfo, NullS, false) != 0) {
++ /* binlog index is not created or has no log file inside:
++ * . old_relay_binlog <- requested dumping position
++ * . requested dumping position <- 0 in the file
++ */
++ if (mi->master_log_pos == BIN_LOG_HEADER_SIZE) {
++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE, BIN_LOG_HEADER_SIZE,
++ binlog_indexname, binlog, sync_mirror_binlog_period);
++ } else {
++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE,
++ binlog_indexname, binlog, sync_mirror_binlog_period);
++ }
++
++ if (mule == NULL) { /* NOTE(review): a mule whose constructor set MULE_ERROR is not rejected on this path -- confirm callers check status */
++ sql_print_error("Mule malloc operation failed.");
++ }
++ } else {
++ IO_CACHE* log_file;
++ MY_STAT stat;
++ char last_binlog_name[FN_REFLEN];
++
++ /* Find the last log file from the binlog index.
++ * Check whether the last binlog matches the requested dump for both
++ * binlog name and binlog size.
++ */
++ for (;;) {
++ strmake(last_binlog_name, linfo.log_file_name, sizeof(last_binlog_name) - 1); /* strmake() writes up to length+1 bytes */
++ last_binlog_name[FN_REFLEN - 1] = '\0';
++ if (binlog->find_next_log(&linfo, false))
++ break;
++ }
++ DBUG_PRINT("info",("the last binlog: %s", last_binlog_name));
++
++ /* if the binlog name does not match, exit with an error. */
++ if (strcmp(last_binlog_name+dirname_length(last_binlog_name),
++ mi->master_log_name) != 0) {
++ sql_print_error("Mule binlog(%s) does not match new relay-binlog(%s)",
++ last_binlog_name, mi->master_log_name);
++ } /* Open the last binlog. */
++ else if (binlog->open(last_binlog_name, LOG_BIN, NULL,
++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) {
++ sql_print_error("Mule open last binlog failed: %s", last_binlog_name);
++ } else {
++ bool valid_file_size = true;
++
++ /* Get the binlog size. */
++ log_file = binlog->get_log_file();
++ if (my_fstat(log_file->file, &stat, MYF(0)) == 0) {
++ /* If the binlog size does not match the requested dump position, then
++ * request the dump from position 0 and verify all events, we need to
++ * verify events because the mule log might be used for serving during
++ * anytime. We must be sure that they are correct.
++ */
++ sql_print_information("Binglog size %lu", (ulong) stat.st_size); /* cast so the argument matches the format */
++ if (stat.st_size == mi->master_log_pos) {
++ mule = new ReplMule(thd, mi, RELAY_MATCH_MULE_RUN, stat.st_size,
++ binlog_indexname, binlog,
++ sync_mirror_binlog_period);
++ } else if (stat.st_size > BIN_LOG_HEADER_SIZE) {
++ mule = new ReplMule(thd, mi, MULE_VERIFY, stat.st_size,
++ binlog_indexname, binlog,
++ sync_mirror_binlog_period);
++ } else if (stat.st_size == BIN_LOG_HEADER_SIZE) {
++ mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE,
++ binlog_indexname, binlog,
++ sync_mirror_binlog_period);
++ } else {
++ char llbuf[22];
++ valid_file_size = false;
++ sql_print_error("Mule binlog file(%s) invalid size: %s",
++ last_binlog_name, llstr(stat.st_size, llbuf));
++ }
++ } else {
++ valid_file_size = false;
++ sql_print_error("Mule binlog file(%s): fstat failed.",
++ last_binlog_name);
++ }
++
++ if (valid_file_size) {
++ if (mule == NULL) {
++ sql_print_error("Mule malloc operation failed.");
++ } else if (mule->status_ == MULE_ERROR) {
++ /* If mule creation fails, indicate the error. */
++ delete mule;
++ mule = NULL;
++ }
++ }
++ }
++ }
++
++ /* Clear the mule binlog mode if there are errors. */
++ if (mule == NULL) {
++ binlog->clear_mule_mode();
++ binlog->close_index_file();
++ }
++
++ /* Unlock binlog index */
++ binlog->unlock_index();
++
++ DBUG_RETURN(mule);
++}
+diff -r 66cc9e0a6768 sql/repl_mule.h
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/sql/repl_mule.h Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,166 @@
++/*
++ Copyright (C) 2007 Google Inc.
++
++This program is free software; you can redistribute it and/or
++modify it under the terms of the GNU General Public License
++as published by the Free Software Foundation; either version 2
++of the License, or (at your option) any later version.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
++*/
++
++#ifndef SQL_REPL_MULE_H__
++#define SQL_REPL_MULE_H__
++
++/* Replication Mule is the class that is responsible for generating
++ * an exact copy of the binlog from a master database. We call this feature
++ * mirror binlog and it can be enabled by setting rpl_mirror_binlog. We
++ * need to keep the same copy for the following purposes:
++ * . The replica can serve the binlog transparently as if they are the
++ * master database. This can relieve master connection overhead.
++ * . During failover, the replica can become the new master and serve
++ * old binlogs transparently.
++ * (The Mule name comes from the popular P2P software eMule.)
++ *
++ * Internally, we call the mirrored binlog mule log.
++ */
++
++class THD;
++class Rotate_log_event;
++class Format_description_log_event;
++typedef struct st_master_info MASTER_INFO;
++
++class ReplMule {
++ public:
++ /* Because the I/O thread also creates the relay log, which is not an exact
++ * copy of the original master's binlog, we have two resources that
++ * might get out of sync.
++ * This enum indicates the status:
++ * MULE_BEHIND - the mule's header is behind:
++ * (mule is activated for the first time)
++ * RELAY_MATCH_MULE - mule matches relay-log
++ * RELAY_MATCH_MULE_RUN - mule matches relay-log and it was not an empty binlog
++ * MULE_VERIFY - mule has more events than the relay-log and needs
++ * verification; we can not verify based on relay-log
++ * events because events might get changed a little;
++ * verification starts with downloading all events in
++ * the last binlog from the master and comparing them
++ * with all events in the mule log;
++ * MULE_VERIFY_RELAY_BEHIND - mule has more events than the relay-log
++ * and relay-log needs to write events
++ * MULE_ERROR - mule detects errors in event duplication
++ *
++ * When the mule mirrors binlogs, it writes an event into the mule log
++ * first. Then, the I/O thread writes the event into the relay log.
++ */
++ enum RelayStatus {
++ MULE_BEHIND = 1,
++ RELAY_MATCH_MULE = 2,
++ RELAY_MATCH_MULE_RUN = 7,
++ MULE_VERIFY = 3,
++ MULE_VERIFY_RELAY_BEHIND = 4,
++ MULE_ERROR = 5,
++ };
++ /* Result of writeEvent(); each value is explained on that method. */
++ enum WriteStatus {
++ WRITE_RELAY = 1,
++ WRITE_ERROR = 2,
++ SKIP_RELAY = 3,
++ };
++
++ private:
++ const Format_description_log_event *desc_event_;
++ THD *io_thd_;
++ MASTER_INFO *mi_;
++
++ /*
++ * The I/O thread writes both the mule log (for mirror binlog) and the
++ * relay log (for the SQL thread).
++ * This variable indicates whether the two are in sync.
++ */
++ RelayStatus status_;
++
++ /* The starting event writing position. */
++ my_off_t dump_position_;
++
++ /* During the initial setup, the last mule log's file size. */
++ my_off_t file_size_;
++
++ /* Internally, we call the mirrored binlog mule log. */
++ MYSQL_LOG *mule_log_;
++
++ /* Sync the mule log to disk for every #N events. */
++ ulong mule_log_sync_period_;
++ ulong mule_log_event_counter_;
++
++ /* mule log's index filename */
++ char mule_indexname_[FN_REFLEN];
++
++ /* the current mule log's filename */
++ char curr_log_filename_[FN_REFLEN];
++ /* Private constructor: a mule is obtained via createReplicationMule(). */
++ ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status,
++ my_off_t file_size, const char *binlog_indexname,
++ MYSQL_LOG *binlog, ulong sync_period);
++
++ /*
++ * Queue the event into the current mule log. If it is a rotation
++ * event, generate a new mule log file.
++ * *skip_event reports whether the event was skipped because it is a
++ * fake event. A fake event is generated by the master to indicate the
++ * current reading position.
++ */
++ int queueEvent(const char* buf, ulong event_len, bool *skip_event);
++
++ /* Append the event to the current mule log. */
++ int appendEvent(const char* buf, ulong event_len);
++
++ bool IsFakeRotation(const char* buf, ulong event_len);
++ bool IsFakeRotation(const Rotate_log_event& rev);
++
++ /* Seek to the specified position in the current open mule log. */
++ void seekToPosition(my_off_t pos);
++
++ public:
++
++ ~ReplMule();
++
++ /* Dump the event into mule binlog.
++ * Input:
++ * buf (IN) - replication event buffer
++ * event_len (IN) - the event length
++ *
++ * Return:
++ * . WRITE_RELAY: the relay log needs to write the event
++ * . WRITE_ERROR: the writing encountered errors
++ * . SKIP_RELAY: the relay log should skip the event
++ */
++ WriteStatus writeEvent(const char* buf, ulong event_len);
++
++ /* createReplicationMule:
++ * Create a mule that relays the master's replication binlog and
++ * generates an exact copy of it on the local filesystem.
++ *
++ * Input:
++ * thd (IN) - replication I/O thread
++ * mi (IN) - master info struct for I/O thread's progress
++ * binlog_indexname (IN) - filename for binlog's index
++ * binlog (IN) - replication binlog
++ *
++ * Return:
++ * . a replication mule if success
++ * . NULL if there are any errors
++ */
++ static ReplMule *createReplicationMule(THD* thd, MASTER_INFO *mi,
++ const char *binlog_indexname,
++ MYSQL_LOG *binlog);
++};
++
++#endif /* SQL_REPL_MULE_H__ */
+diff -r 66cc9e0a6768 sql/set_var.cc
+--- a/sql/set_var.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/set_var.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -345,6 +345,8 @@
+ slog_verb);
+ sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank",
+ &rpl_recovery_rank);
++sys_var_bool_ptr sys_rpl_mirror_binlog_enabled("rpl_mirror_binlog_enabled",
++ &rpl_mirror_binlog_enabled);
+ sys_var_long_ptr sys_query_cache_size("query_cache_size",
+ &query_cache_size,
+ fix_query_cache_size);
+@@ -364,6 +366,9 @@
+ sys_var_thd_ulong sys_trans_prealloc_size("transaction_prealloc_size",
+ &SV::trans_prealloc_size,
+ 0, fix_trans_mem_root);
++sys_var_long_ptr sys_sync_mirror_binlog_period(
++ "sync_mirror_binlog_period",
++ &sync_mirror_binlog_period);
+
+ #ifdef HAVE_QUERY_CACHE
+ sys_var_long_ptr sys_query_cache_limit("query_cache_limit",
+@@ -774,6 +779,7 @@
+ &sys_relay_log_purge,
+ #endif
+ &sys_rpl_recovery_rank,
++ &sys_rpl_mirror_binlog_enabled,
+ &sys_safe_updates,
+ &sys_secure_auth,
+ &sys_secure_file_priv,
+@@ -1113,6 +1119,8 @@
+ {"relay_log_space_limit", (char*) &relay_log_space_limit, SHOW_LONGLONG},
+ #endif
+ {sys_rpl_recovery_rank.name,(char*) &sys_rpl_recovery_rank, SHOW_SYS},
++ {sys_rpl_mirror_binlog_enabled.name,
++ (char *) &sys_rpl_mirror_binlog_enabled, SHOW_SYS},
+ {"secure_auth", (char*) &sys_secure_auth, SHOW_SYS},
+ {"secure_file_priv", (char*) &sys_secure_file_priv, SHOW_SYS},
+ #ifdef HAVE_SMEM
+diff -r 66cc9e0a6768 sql/slave.cc
+--- a/sql/slave.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/slave.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -25,6 +25,7 @@
+ #include <thr_alarm.h>
+ #include <my_dir.h>
+ #include <sql_common.h>
++#include "repl_mule.h"
+ #include <errmsg.h>
+ #include <mysys_err.h>
+
+@@ -3527,6 +3528,7 @@
+ RELAY_LOG_INFO *rli= &mi->rli;
+ char llbuff[22];
+ uint retry_count;
++ ReplMule *mule = NULL;
+
+ // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
+ my_thread_init();
+@@ -3609,6 +3611,23 @@
+ if (get_master_version_and_clock(mysql, mi))
+ goto err;
+
++ if (rpl_mirror_binlog_enabled && !mule) {
++ if (opt_binlog_index_name == NULL) {
++ sql_print_error("\"log-bin-index\" must be set in mirror binlog.");
++ goto err;
++ }
++
++ /* Create the mule to generate the exact copy of the binlog */
++ mule = ReplMule::createReplicationMule(
++ thd, mi, opt_binlog_index_name, &mysql_bin_log);
++
++ /* If we could not create the mule, we stop the I/O thread and report
++ * an error.
++ */
++ if (mule == NULL)
++ goto err;
++ }
++
+ if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
+ {
+ /*
+@@ -3624,6 +3643,7 @@
+ DBUG_PRINT("info",("Starting reading binary log from master"));
+ while (!io_slave_killed(thd,mi))
+ {
++ const char* event_buf;
+ bool suppress_warnings= 0;
+ thd_proc_info(thd, "Requesting binlog dump");
+ if (request_dump(mysql, mi, &suppress_warnings))
+@@ -3754,10 +3774,25 @@
+ goto connected;
+ } // if (event_len == packet_error)
+
++ event_buf = (const char*)mysql->net.read_pos + 1;
++
++ if (mule) {
++ ReplMule::WriteStatus d_status =
++ mule->writeEvent(event_buf, event_len);
++ switch (d_status) {
++ case ReplMule::WRITE_RELAY:
++ break;
++ case ReplMule::SKIP_RELAY:
++ /* Skip writing relay event; go back to read the next event */
++ continue;
++ case ReplMule::WRITE_ERROR:
++ goto err;
++ }
++ }
++
+ retry_count=0; // ok event, reset retry counter
+ thd_proc_info(thd, "Queueing master event to the relay log");
+- if (queue_event(mi,(const char*)mysql->net.read_pos + 1,
+- event_len))
++ if (queue_event(mi, event_buf, event_len))
+ {
+ sql_print_error("Slave I/O thread could not queue event from master");
+ goto err;
+@@ -3847,6 +3882,7 @@
+ change_rpl_status(RPL_ACTIVE_SLAVE,RPL_IDLE_SLAVE);
+ DBUG_ASSERT(thd->net.buff != 0);
+ net_end(&thd->net); // destructor will not free it, because net.vio is 0
++ delete mule;
+ close_thread_tables(thd, 0);
+ pthread_mutex_lock(&LOCK_thread_count);
+ THD_CHECK_SENTRY(thd);
+diff -r 66cc9e0a6768 sql/sql_class.h
+--- a/sql/sql_class.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_class.h Thu Dec 04 21:46:15 2008 -0800
+@@ -152,6 +152,12 @@
+ #define LOG_INFO_FATAL -7
+ #define LOG_INFO_IN_USE -8
+
++/* If the maximum size is equal to this value, the binlog is not rotated
++ * on the size limit.
++ */
++#define BINLOG_NOSWITCH_SIZE ((ulong) -1)
++
++
+ /* bitmap to SQL_LOG::close() */
+ #define LOG_CLOSE_INDEX 1
+ #define LOG_CLOSE_TO_BE_OPENED 2
+@@ -245,6 +251,9 @@
+ bool no_auto_events;
+ friend class Log_event;
+
++ /* mule replication mode */
++ bool mule_binlog_;
++
+ public:
+ /*
+ These describe the log's format. This is used only for relay logs.
+@@ -317,7 +326,8 @@
+ }
+ bool open_index_file(const char *index_file_name_arg,
+ const char *log_name);
+- void new_file(bool need_lock);
++ int close_index_file();
++ void new_file(bool need_lock= 1, const char* log_filename= NULL);
+ bool write(THD *thd, enum enum_server_command command,
+ const char *format, ...) ATTRIBUTE_FORMAT(printf, 4, 5);
+ bool write(THD *thd, const char *query, uint query_length,
+@@ -357,7 +367,27 @@
+ int get_current_log(LOG_INFO* linfo);
+ int raw_get_current_log(LOG_INFO* linfo);
+ uint next_file_id();
+- inline bool is_open() { return log_type != LOG_CLOSED; }
++
++ /* Because MySQL uses is_open() to check whether replication is on,
++ * we let that check fail while the binlog is in mule mode. Mule
++ * replication and normal master replication cannot be on at the same time.
++ *
++ * is_log_open(): the binlog file is open for either purpose
++ *
++ * is_open(): the binlog is open for master replication.
++ * is_mule_open(): the binlog is open for mirror binlog or for
++ * replication mule; refer to repl_mule.h for details
++ */
++  bool is_log_open() {
++    return !(log_type == LOG_CLOSED);   /* open for any purpose */
++  }
++  bool is_open() {
++    return is_log_open() && !mule_binlog_;   /* open, and not as a mule */
++  }
++  bool is_mule_open() {
++    return is_log_open() && mule_binlog_;    /* open in mule mode */
++  }
++
+ inline char* get_index_fname() { return index_file_name;}
+ inline char* get_log_fname() { return log_file_name; }
+ inline char* get_name() { return name; }
+@@ -366,8 +396,18 @@
+
+ inline void lock_index() { pthread_mutex_lock(&LOCK_index);}
+ inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
++ inline void lock_log() { pthread_mutex_lock(&LOCK_log);} /* acquire LOCK_log */
++ inline void unlock_log() { pthread_mutex_unlock(&LOCK_log);} /* release LOCK_log */
+ inline IO_CACHE *get_index_file() { return &index_file;}
+ inline uint32 get_open_count() { return open_count; }
++  /* Switch the binlog into mule mode; "mule" is defined in repl_mule.h. */
++  void set_mule_mode() {
++    mule_binlog_ = true;
++  }
++  void clear_mule_mode() {
++    mule_binlog_ = false;   /* back to the normal (non-mule) mode */
++  }
++ int flush_log_file();
+ };
+
+ /*
+diff -r 66cc9e0a6768 sql/sql_lex.h
+--- a/sql/sql_lex.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_lex.h Thu Dec 04 21:46:15 2008 -0800
+@@ -104,6 +104,7 @@
+ // TODO(mcallaghan): update status_vars in mysqld to export these
+ SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS,
+ SQLCOM_SHOW_CLIENT_STATS,
++ SQLCOM_MAKE_MASTER,
+ /* This should be the last !!! */
+ SQLCOM_END
+ };
+@@ -171,6 +172,12 @@
+ char *ssl_key, *ssl_cert, *ssl_ca, *ssl_capath, *ssl_cipher;
+ char *relay_log_name;
+ ulong relay_log_pos;
++
++ /* the following fields are used for make master command */
++ char *log_index_name;
++ bool in_failover;
++ bool kill_session;
++ bool with_old_binlog;
+ } LEX_MASTER_INFO;
+
+
+diff -r 66cc9e0a6768 sql/sql_parse.cc
+--- a/sql/sql_parse.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_parse.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -402,6 +402,15 @@
+ passwd_len ? "yes": "no",
+ thd->main_security_ctx.master_access,
+ (thd->db ? thd->db : "*none*")));
++
++ /* If we are in failover mode, reject all non-super user connections. */
++ if (is_in_failover() &&
++ !(thd->main_security_ctx.master_access & SUPER_ACL)) {
++ net_send_error(thd, ER_SPECIFIC_ACCESS_DENIED_ERROR,
++ "super-user only during failover");
++ DBUG_RETURN(-1);
++ }
++
+
+ if (check_count)
+ {
+@@ -3470,6 +3479,22 @@
+ else
+ res = load_master_data(thd);
+ break;
++
++ case SQLCOM_MAKE_MASTER:
++ {
++ thd_proc_info(thd, "Making master");
++
++ if (check_global_access(thd, SUPER_ACL))
++ goto error;
++ res = make_master(thd, NULL, NULL, &lex->mi);
++ if (res == 0) {
++ // TODO -- wei is this OK, setting it to NULL?
++ thd_proc_info(thd, 0);
++ send_ok(thd);
++ }
++ break;
++ }
++
+ #endif /* HAVE_REPLICATION */
+ #ifdef HAVE_NDBCLUSTER_DB
+ case SQLCOM_SHOW_NDBCLUSTER_STATUS:
+diff -r 66cc9e0a6768 sql/sql_repl.cc
+--- a/sql/sql_repl.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_repl.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -20,11 +20,19 @@
+ #include "log_event.h"
+ #include <my_dir.h>
+
++extern pthread_mutex_t LOCK_failover_master;
++extern bool failover_deny_access;
++
+ int max_binlog_dump_events = 0; // unlimited
+ my_bool opt_sporadic_binlog_dump_fail = 0;
+ #ifndef DBUG_OFF
+ static int binlog_dump_count = 0;
+ #endif
++
++static int make_master_open_log(MYSQL_LOG *log, const char *opt_name,
++ bool no_auto_events, ulong max_size);
++static int set_in_failover(bool kill_session);
++static void clear_in_failover(void);
+
+ /*
+ fake_rotate_event() builds a fake (=which does not exist physically in any
+@@ -255,7 +263,7 @@
+ bool purge_master_logs(THD* thd, const char* to_log)
+ {
+ char search_file_name[FN_REFLEN];
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ send_ok(thd);
+ return FALSE;
+@@ -308,6 +316,44 @@
+ return error;
+ }
+
++/* Publish the binlog dump state in the thread's proc_info as
++ * "<input_msg> :<binlog file name>:<position>:" for SHOW PROCESSLIST.
++ * Input:
++ *   output_info   - (OUT) buffer that receives the formatted proc_info
++ *   output_len    - (IN)  size of output_info in bytes
++ *   thd           - (IN)  the thread whose proc_info is updated
++ *   input_msg     - (IN)  the state message to show
++ *   log_file_name - (IN)  binlog file name; shown without its directory
++ *   log_pos       - (IN)  binlog position
++ */
++static void processlist_show_binlog_state(char *output_info,
++                                          int output_len,
++                                          THD *thd,
++                                          const char *input_msg,
++                                          const char *log_file_name,
++                                          my_off_t log_pos) {
++  DBUG_ENTER("processlist_show_binlog_state");
++
++  /* Point to input_msg first, in case "show processlist" reads proc_info
++   * while output_info is still being rewritten.
++   */
++  thd_proc_info(thd, input_msg);
++
++  /* Cast explicitly so the argument always matches the conversion. */
++  if (snprintf(output_info, output_len, "%s :%s:%llu:", input_msg,
++               log_file_name + dirname_length(log_file_name),
++               (unsigned long long) log_pos) > 0) {
++    thd_proc_info(thd, output_info);
++  }
++  DBUG_VOID_RETURN;
++}
++
++/* Drop this dump from the count of mirror binlog clients, if it is one. */
++static void repl_cleanup(ushort flags) {
++  if (!(flags & BINLOG_MIRROR_CLIENT))
++    return;                         /* not a mirror binlog client */
++  thread_safe_sub(rpl_mirror_binlog_clients, 1, &LOCK_stats);
++}
+
+ /*
+ TODO: Clean up loop to only have one call to send_file()
+@@ -319,6 +365,11 @@
+ LOG_INFO linfo;
+ char *log_file_name = linfo.log_file_name;
+ char search_file_name[FN_REFLEN], *name;
++
++ /* This buffer should be enough for "comments + :file_name:file_pos:". */
++ char binlog_state_msg[FN_REFLEN + 100];
++ int binlog_state_msg_len = FN_REFLEN + 100;
++
+ IO_CACHE log;
+ File file = -1;
+ String* packet = &thd->packet;
+@@ -335,6 +386,15 @@
+
+ bzero((char*) &log,sizeof(log));
+
++ sql_print_information("Start %s binlog_dump to slave_server(%d), pos(%s, %lu)",
++ "asynchronous",
++ thd->server_id, log_ident, (ulong)pos);
++
++ if (flags & BINLOG_MIRROR_CLIENT) {
++ /* One more mirror binlog clients. */
++ thread_safe_increment(rpl_mirror_binlog_clients, &LOCK_stats);
++ }
++
+ #ifndef DBUG_OFF
+ if (opt_sporadic_binlog_dump_fail && (binlog_dump_count++ % 2))
+ {
+@@ -344,7 +404,7 @@
+ }
+ #endif
+
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ errmsg = "Binary log is not open";
+ my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
+@@ -529,6 +589,12 @@
+ }
+ #endif
+
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Send binlog events to slave",
++ log_file_name, pos);
++
+ if ((*packet)[EVENT_TYPE_OFFSET+1] == FORMAT_DESCRIPTION_EVENT)
+ {
+ binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+1] &
+@@ -634,6 +700,13 @@
+ }
+ if (!thd->killed)
+ {
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Has sent all binlog to slave; "
++ "waiting for binlog to be updated",
++ log_file_name, pos);
++
+ /* Note that the following call unlocks lock_log */
+ mysql_bin_log.wait_for_update(thd, 0);
+ }
+@@ -650,7 +723,12 @@
+
+ if (read_packet)
+ {
+- thd_proc_info(thd, "Sending binlog event to slave");
++ // thd_proc_info(thd, "Sending binlog event to slave");
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(binlog_state_msg,
++ binlog_state_msg_len, thd,
++ "Sending binlog event to slave",
++ log_file_name, pos);
+ if (my_net_write(net, (char*)packet->ptr(), packet->length()) )
+ {
+ errmsg = "Failed on my_net_write()";
+@@ -685,10 +763,21 @@
+ }
+ else
+ {
++ char old_log_file_name[FN_REFLEN];
+ bool loop_breaker = 0;
+ /* need this to break out of the for loop from switch */
+
+- thd_proc_info(thd, "Finished reading one binlog; switching to next binlog");
++ // thd_proc_info(thd, "Finished reading one binlog; switching to next binlog");
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Finished reading one binlog; switching to next binlog",
++ log_file_name, pos);
++
++ /* Keep the old filename. */
++ strmake(old_log_file_name, log_file_name,
++ sizeof(old_log_file_name) - 1);
++
+ switch (mysql_bin_log.find_next_log(&linfo, 1)) {
+ case LOG_INFO_EOF:
+ loop_breaker = (flags & BINLOG_DUMP_NON_BLOCK);
+@@ -706,6 +795,16 @@
+
+ end_io_cache(&log);
+ (void) my_close(file, MYF(MY_WME));
++
++ /* A sanity check that we can not serve the same binlog twice because
++ * the filenames are stored in a .index file.
++ */
++ if (strcmp(old_log_file_name, log_file_name) >= 0) {
++ errmsg = "Re-serving an already served binlog file.";
++ my_errno = ER_MASTER_FATAL_ERROR_READING_BINLOG;
++ goto err;
++ }
++
+
+ /*
+ Call fake_rotate_event() in case the previous log (the one which
+@@ -733,6 +832,8 @@
+ end_io_cache(&log);
+ (void)my_close(file, MYF(MY_WME));
+
++ repl_cleanup(flags);
++
+ send_eof(thd);
+ thd_proc_info(thd, "Waiting to finalize termination");
+ pthread_mutex_lock(&LOCK_thread_count);
+@@ -743,6 +844,7 @@
+ err:
+ thd_proc_info(thd, "Waiting to finalize termination");
+ end_io_cache(&log);
++ repl_cleanup(flags);
+ /*
+ Exclude iteration through thread list
+ this is needed for purge_logs() - it will iterate through
+@@ -1316,7 +1418,7 @@
+ Format_description_log_event *description_event= new
+ Format_description_log_event(3); /* MySQL 4.0 by default */
+
+- if (mysql_bin_log.is_open())
++ if (mysql_bin_log.is_log_open())
+ {
+ LEX_MASTER_INFO *lex_mi= &thd->lex->mi;
+ SELECT_LEX_UNIT *unit= &thd->lex->unit;
+@@ -1456,7 +1558,7 @@
+ DBUG_RETURN(TRUE);
+ protocol->prepare_for_resend();
+
+- if (mysql_bin_log.is_open())
++ if (mysql_bin_log.is_log_open())
+ {
+ LOG_INFO li;
+ mysql_bin_log.get_current_log(&li);
+@@ -1497,7 +1599,7 @@
+ Protocol *protocol= thd->protocol;
+ DBUG_ENTER("show_binlogs");
+
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ my_message(ER_NO_BINARY_LOGGING, ER(ER_NO_BINARY_LOGGING), MYF(0));
+ return 1;
+@@ -1606,6 +1708,235 @@
+ DBUG_RETURN(0);
+ }
+
++
++/* make_master: Make the current database a primary and start binlog
++ * logging for all updates, or switch failover session handling on/off.
++ *
++ * The function handles the following sql commands:
++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1
++ *   [WITH BINLOG];
++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1,
++ *   INDEX='replication_log.index' [WITH BINLOG];
++ * . MAKE MASTER REVOKE SESSION;
++ * . MAKE MASTER REVOKE SESSION WITH KILL;
++ * . MAKE MASTER GRANT SESSION;
++ *
++ * Args:
++ *   thd              - the current thread
++ *   binlog_name      - binlog's filename (used only when mi is NULL)
++ *   binlog_indexname - binlog index's filename (not used by this function)
++ *   mi               - master info struct containing binlog name
++ *                      (set when we enable master during runtime)
++ *
++ * Return:
++ *    0 : success
++ *   -1 : failure
++ */
++int make_master(THD* thd,
++                const char *binlog_name,
++                const char *binlog_indexname,
++                const LEX_MASTER_INFO* mi) {
++  int error = 0;
++
++  DBUG_ENTER("make_master");
++  /* In two modes, we enable the binlog:
++   * . !mi - LEX is not provided; this is called from startup time
++   * . mi->log_file_name - binlog is specified in the command
++   */
++  if (!mi || mi->log_file_name) {
++    /* Set once we know that no log was open when this call started. */
++    bool log_was_closed = false;
++
++    VOID(pthread_mutex_lock(&LOCK_failover_master));
++
++    /* If the binlog is already opened, we issue an error. We reuse one
++     * existing error, which might not be fully accurate. */
++    if (mysql_bin_log.is_log_open()) {
++      my_error(ER_MASTER_INFO, MYF(0));
++      sql_print_error("Replication master log is already open: cannot "
++                      "make another master!");
++      error = -1;
++    } else {
++      log_was_closed = true;
++      if (!mi) {
++        /* This opening happens at mysql startup time. */
++        if (make_master_open_log(&mysql_bin_log, binlog_name,
++                                 0, max_binlog_size) != 0) {
++          error = -1;
++        }
++      } else {
++        /* This opening happens during mysql runtime, which is mostly
++         * requested to do failover. */
++        error = -1;
++        if (!is_in_failover()) {
++          sql_print_error(
++              "\"make master\" runs only in failover mode. "
++              "Please run \"make master revoke session (with kill)\"");
++        } else if (strlen(mi->log_file_name) == 0) {
++          sql_print_error("Master log filename is not specified correctly.");
++        } else if (!mi->server_id || mi->server_id == MASTER_INFO_SERVER_ID) {
++          sql_print_error("\"make master\": invalid server_id(%lu)",
++                          (ulong) mi->server_id);
++        } else {
++          /* Open the new log files and delete all existing ones to avoid
++           * conflicts. */
++          uint32 old_server_id = server_id;
++          char *new_binlog_name = NULL;
++
++          /* Set the global master server id.
++           * We would not change server id for all connection threads.
++           * All non-super sessions should be blocked by revoke sessions.
++           * Super-user sessions are responsible for their own operations.
++           */
++          server_id = mi->server_id;
++          thd->server_id = mi->server_id;
++
++          if (!(new_binlog_name = my_strdup(mi->log_file_name, MYF(0))) ||
++              make_master_open_index(&new_binlog_name, mi->log_index_name) != 0 ||
++              make_master_open_log(&mysql_bin_log, new_binlog_name,
++                                   0, max_binlog_size) != 0) {
++            sql_print_error("Open master logfile failed.");
++            thd->server_id = old_server_id;
++            server_id = old_server_id;
++          } else if (!mi->with_old_binlog &&
++                     mysql_bin_log.reset_logs(thd) != 0) {
++            sql_print_error("Cleanup existing master logfiles failed.");
++            thd->server_id = old_server_id;
++            server_id = old_server_id;
++          } else {
++            error = 0;
++          }
++          /* The duplicated name is not referenced past this point. */
++          my_free(new_binlog_name, MYF(MY_ALLOW_ZERO_PTR));
++        }
++        if (error == -1)
++          my_error(ER_MASTER_INFO, MYF(0));
++      }
++    }
++
++    if (error == 0) {
++      /* indicates that binlog is enabled now */
++      using_update_log = 1;
++    } else if (log_was_closed && mysql_bin_log.is_open()) {
++      /* Only undo an open attempted by this call. */
++      mysql_bin_log.close(LOG_CLOSE_INDEX);
++    }
++
++    VOID(pthread_mutex_unlock(&LOCK_failover_master));
++  } else {
++    /* The following actions are related to session management during
++     * failover operation. We do not want some sessions come in
++     * during failover and make updates.
++     * This is invoked for command: MAKE MASTER GRANT/REVOKE SESSION; */
++    if (mi->in_failover) {
++      set_in_failover(mi->kill_session);
++    } else {
++      clear_in_failover();
++    }
++  }
++
++  DBUG_RETURN(error);
++}
++
++/* Open 'log' as a binary log named after opt_name without its extension. */
++static int make_master_open_log(MYSQL_LOG *log,
++                                const char *opt_name,
++                                bool no_auto_events,
++                                ulong max_size) {
++  char tmp[FN_REFLEN];
++
++  /* Get rid of the extension. strmake() stores the terminating NUL at
++   * dst[length], so at most sizeof(tmp) - 1 characters may be copied. */
++  uint length = (uint) (fn_ext(opt_name) - opt_name);
++  strmake(tmp, opt_name, min(length, (uint) sizeof(tmp) - 1));
++
++  return log->open(tmp, LOG_BIN, NULL, WRITE_CACHE, no_auto_events,
++                   max_size, 0);
++}
++
++/* Open the binlog index file. If generate_name() returns the log name in
++ * its scratch buffer, *binlog_name is freed and replaced by a my_strdup()
++ * copy of that name. Returns 0 on success, -1 on failure. */
++int make_master_open_index(char **binlog_name,
++                           const char *binlog_indexname) {
++  char buf[FN_REFLEN];
++  const char *ln;
++  DBUG_ENTER("make_master_open_index");
++
++  ln= mysql_bin_log.generate_name(*binlog_name, "-bin", 1, buf);
++  if (!(*binlog_name) && !binlog_indexname) {
++    /*
++      User didn't give us info to name the binlog index file. Picking
++      `hostname`-bin.index makes replication fail if the hostname is
++      changed later; to keep existing setups working we only warn.
++    */
++    sql_print_warning("No argument was provided to --log-bin, and "
++                      "--log-bin-index was not used; so replication "
++                      "may break when this MySQL server acts as a "
++                      "master and has his hostname changed!! Please "
++                      "use '--log-bin=%s' to avoid this problem.", ln);
++  }
++  if (ln == buf) {
++    my_free(*binlog_name, MYF(MY_ALLOW_ZERO_PTR));
++    if (!(*binlog_name = my_strdup(buf, MYF(0)))) {
++      DBUG_RETURN(-1);                /* out of memory: no name to hand back */
++    }
++  }
++  if (mysql_bin_log.open_index_file(binlog_indexname, ln) != 0) {
++    DBUG_RETURN(-1);
++  }
++
++  /* Selects the type of lock used for queries of type INSERT ... SELECT. */
++  using_update_log=1;
++
++  DBUG_RETURN(0);
++}
++
++/* Set the status indicating that we are in failover and deny all non-super
++ * user access.
++ *
++ * Args:
++ *   kill_session - kill all non-super sessions if specified
++ *
++ * Return:
++ *   0 - always; sessions are only asked to die (THD::awake), this
++ *       function does not wait for them to terminate
++ */
++static int set_in_failover(bool kill_session) {
++  /* From now on is_in_failover() reports true to its callers. */
++  failover_deny_access = 1;
++
++  if (!kill_session) {
++    return 0;
++  }
++
++  /* The kill session option is specified: ask every non-super user
++   * session to die.
++   */
++  pthread_mutex_lock(&LOCK_thread_count); // For unlink from list
++  I_List_iterator<THD> it(threads);
++  THD *kill_thd;
++  while ((kill_thd=it++)) {
++    if (kill_thd->main_security_ctx.master_access & SUPER_ACL) {
++      continue;                                 // super sessions survive
++    }
++    pthread_mutex_lock(&kill_thd->LOCK_delete); // Lock from delete
++    kill_thd->awake(THD::KILL_CONNECTION);      // ask the thread to die
++    pthread_mutex_unlock(&kill_thd->LOCK_delete);
++  }
++  pthread_mutex_unlock(&LOCK_thread_count);
++  return 0;
++}
++
++static void clear_in_failover(void) {
++  failover_deny_access = false;   /* leave failover mode */
++}
++
++bool is_in_failover(void) {
++ return failover_deny_access; /* plain read of the flag; no lock is taken here */
++}
++
++
+ #endif /* HAVE_REPLICATION */
+
+
+diff -r 66cc9e0a6768 sql/sql_repl.h
+--- a/sql/sql_repl.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_repl.h Thu Dec 04 21:46:15 2008 -0800
+@@ -38,6 +38,10 @@
+ int start_slave(THD* thd, MASTER_INFO* mi, bool net_report);
+ int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report);
+ bool change_master(THD* thd, MASTER_INFO* mi);
++int make_master(THD* thd, const char *binlog_name,
++ const char *binlog_indexname, const LEX_MASTER_INFO* mi);
++int make_master_open_index(char **binlog_name, const char *binlog_indexname);
++bool is_in_failover(void);
+ bool mysql_show_binlog_events(THD* thd);
+ int cmp_master_pos(const char* log_file_name1, ulonglong log_pos1,
+ const char* log_file_name2, ulonglong log_pos2);
+diff -r 66cc9e0a6768 sql/sql_yacc.yy
+--- a/sql/sql_yacc.yy Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_yacc.yy Thu Dec 04 21:46:15 2008 -0800
+@@ -735,6 +735,7 @@
+ %token LOOP_SYM
+ %token LOW_PRIORITY
+ %token LT
++%token MAKE_SYM
+ %token MAKE_SET_SYM
+ %token MASTER_CONNECT_RETRY_SYM
+ %token MASTER_HOST_SYM
+@@ -1167,7 +1168,7 @@
+ query verb_clause create change select do drop insert replace insert2
+ insert_values update delete truncate rename
+ show describe load alter optimize keycache preload flush
+- reset purge begin commit rollback savepoint release
++ make reset purge begin commit rollback savepoint release
+ slave master_def master_defs master_file_def slave_until_opts
+ repair restore backup analyze check start checksum
+ field_list field_list_item field_spec kill column_def key_def
+@@ -1301,6 +1302,7 @@
+ | kill
+ | load
+ | lock
++ | make
+ | optimize
+ | keycache
+ | preload
+@@ -1428,6 +1430,56 @@
+ master_defs
+ {}
+ ;
++
++/* MAKE MASTER ...: selects SQLCOM_MAKE_MASTER; options are stored in lex->mi */
++make:
++ MAKE_SYM MASTER_SYM
++ {
++ LEX *lex = Lex;
++ lex->sql_command = SQLCOM_MAKE_MASTER;
++ bzero((char*) &lex->mi, sizeof(lex->mi)); /* every option starts as 0/NULL */
++ }
++ make_master_defs
++ {
++ }
++ ;
++
++make_master_defs:
++ MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num
++ {
++ Lex->mi.log_file_name = $3.str; /* non-NULL: make_master() opens a binlog */
++ Lex->mi.server_id = $7;
++ }
++ make_master_with_defs {}
++ | MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ',' INDEX_SYM EQ TEXT_STRING
++ {
++ Lex->mi.log_file_name = $3.str;
++ Lex->mi.server_id = $7;
++ Lex->mi.log_index_name = $11.str; /* stays NULL in the form without INDEX */
++ }
++ make_master_with_defs {}
++ | GRANT SESSION_SYM
++ {
++ Lex->mi.in_failover = 0; /* make_master() calls clear_in_failover() */
++ }
++ | REVOKE SESSION_SYM
++ {
++ Lex->mi.in_failover = 1; /* make_master() calls set_in_failover() */
++ }
++ | REVOKE SESSION_SYM WITH KILL_SYM
++ {
++ Lex->mi.in_failover = 1;
++ Lex->mi.kill_session = 1; /* also ask existing non-super sessions to die */
++ }
++ ;
++
++make_master_with_defs:
++          /* empty: with_old_binlog keeps the 0 set by the bzero() in "make" */ {}
++        | WITH BINLOG_SYM
++          { /* All old binlogs will be kept after "make master" command. */
++            Lex->mi.with_old_binlog = 1;
++          }
++        ;
+
+ master_defs:
+ master_def
+@@ -8396,6 +8448,7 @@
+ | HANDLER_SYM {}
+ | HELP_SYM {}
+ | LANGUAGE_SYM {}
++ | MAKE_SYM {}
+ | NO_SYM {}
+ | OPEN_SYM {}
+ | PREPARE_SYM {}