diff options
author | Robin H. Johnson <robbat2@gentoo.org> | 2010-08-09 00:21:00 +0000 |
---|---|---|
committer | Robin H. Johnson <robbat2@gentoo.org> | 2010-08-09 00:21:00 +0000 |
commit | 4b509e569a5d958c4a81e18dedd3df31a6092391 (patch) | |
tree | 18750b10edc3f28dc63d67ed9549ec1fb53bdd04 /percona/5.0.91-b22-20100522/innodb_rw_lock.patch | |
parent | Updated 07110 patch for mysql-5.1.49. (diff) | |
download | mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.gz mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.bz2 mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.zip |
Adding latest Percona patches.
Diffstat (limited to 'percona/5.0.91-b22-20100522/innodb_rw_lock.patch')
-rw-r--r-- | percona/5.0.91-b22-20100522/innodb_rw_lock.patch | 2480 |
1 files changed, 2480 insertions, 0 deletions
diff --git a/percona/5.0.91-b22-20100522/innodb_rw_lock.patch b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch new file mode 100644 index 0000000..a509f70 --- /dev/null +++ b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch @@ -0,0 +1,2480 @@ +diff -ruN a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c +--- a/innobase/btr/btr0cur.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/btr/btr0cur.c 2009-10-22 15:18:44.000000000 +0900 +@@ -313,7 +313,7 @@ + #ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; + #endif +- if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED ++ if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED + && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ + && !estimate + #ifdef PAGE_CUR_LE_OR_EXTENDS +diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c +--- a/innobase/btr/btr0sea.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/btr/btr0sea.c 2009-10-22 15:18:44.000000000 +0900 +@@ -773,8 +773,8 @@ + rw_lock_s_lock(&btr_search_latch); + } + +- ut_ad(btr_search_latch.writer != RW_LOCK_EX); +- ut_ad(btr_search_latch.reader_count > 0); ++ ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX); ++ ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0); + + rec = ha_search_and_get_data(btr_search_sys->hash_index, fold); + +diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c +--- a/innobase/buf/buf0buf.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/buf/buf0buf.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1292,7 +1292,7 @@ + + if (mode == BUF_GET_NOWAIT) { + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { +@@ -1442,7 +1442,7 @@ + ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset)); + + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = 
MTR_MEMO_PAGE_S_FIX; + } else { +@@ -1596,7 +1596,7 @@ + ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD)); + + if (rw_latch == RW_S_LATCH) { +- success = rw_lock_s_lock_func_nowait(&(block->lock), ++ success = rw_lock_s_lock_nowait(&(block->lock), + file, line); + fix_type = MTR_MEMO_PAGE_S_FIX; + } else { +diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic +--- a/innobase/include/buf0buf.ic 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/include/buf0buf.ic 2009-10-22 16:12:25.000000000 +0900 +@@ -523,7 +523,7 @@ + #ifdef UNIV_SYNC_DEBUG + ibool ret; + +- ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line); ++ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line); + + ut_ad(ret == TRUE); + ut_ad(mutex_own(&block->mutex)); +diff -ruN a/innobase/include/os0sync.h b/innobase/include/os0sync.h +--- a/innobase/include/os0sync.h 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/os0sync.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,11 +1,35 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The interface to the operating system + synchronization primitives. + +-(c) 1995 Innobase Oy +- + Created 9/6/1995 Heikki Tuuri + *******************************************************/ ++ + #ifndef os0sync_h + #define os0sync_h + +@@ -261,6 +285,23 @@ + /*===============*/ + os_fast_mutex_t* fast_mutex); /* in: mutex to free */ + ++#ifdef HAVE_ATOMIC_BUILTINS ++/************************************************************** ++Atomic compare-and-swap for InnoDB. Currently requires GCC atomic builtins. ++Returns true if swapped, ptr is pointer to target, old_val is value to ++compare to, new_val is the value to swap in. */ ++#define os_compare_and_swap(ptr, old_val, new_val) \ ++ __sync_bool_compare_and_swap(ptr, old_val, new_val) ++ ++/************************************************************** ++Atomic increment for InnoDB. Currently requires GCC atomic builtins. ++Returns the resulting value, ptr is pointer to target, amount is the ++amount of increment. */ ++#define os_atomic_increment(ptr, amount) \ ++ __sync_add_and_fetch(ptr, amount) ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ + #ifndef UNIV_NONINL + #include "os0sync.ic" + #endif +diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h +--- a/innobase/include/sync0rw.h 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0rw.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. 
++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for threads, not for database transactions) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -24,6 +47,12 @@ + #define RW_X_LATCH 2 + #define RW_NO_LATCH 3 + ++/* We decrement lock_word by this amount for each x_lock. It is also the ++start value for the lock_word, meaning that it limits the maximum number ++of concurrent read locks before the rw_lock breaks. 
The current value of ++0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/ ++#define X_LOCK_DECR 0x00100000 ++ + typedef struct rw_lock_struct rw_lock_t; + #ifdef UNIV_SYNC_DEBUG + typedef struct rw_lock_debug_struct rw_lock_debug_t; +@@ -47,14 +76,14 @@ + there may be waiters for the event */ + #endif /* UNIV_SYNC_DEBUG */ + +-extern ulint rw_s_system_call_count; +-extern ulint rw_s_spin_wait_count; +-extern ulint rw_s_exit_count; +-extern ulint rw_s_os_wait_count; +-extern ulint rw_x_system_call_count; +-extern ulint rw_x_spin_wait_count; +-extern ulint rw_x_os_wait_count; +-extern ulint rw_x_exit_count; ++extern ib_longlong rw_s_spin_wait_count; ++extern ib_longlong rw_s_spin_round_count; ++extern ib_longlong rw_s_exit_count; ++extern ib_longlong rw_s_os_wait_count; ++extern ib_longlong rw_x_spin_wait_count; ++extern ib_longlong rw_x_spin_round_count; ++extern ib_longlong rw_x_os_wait_count; ++extern ib_longlong rw_x_exit_count; + + /********************************************************************** + Creates, or rather, initializes an rw-lock object in a specified memory +@@ -116,8 +145,22 @@ + NOTE! The following macros should be used in rw s-locking, not the + corresponding function. */ + +-#define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(\ +- (M), __FILE__, __LINE__) ++#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\ ++ (M), 0, (F), (L)) ++/********************************************************************** ++Low-level function which tries to lock an rw-lock in s-mode. Performs no ++spinning. 
*/ ++UNIV_INLINE ++ibool ++rw_lock_s_lock_low( ++/*===============*/ ++ /* out: TRUE if success */ ++ rw_lock_t* lock, /* in: pointer to rw-lock */ ++ ulint pass __attribute__((unused)), ++ /* in: pass value; != 0, if the lock will be ++ passed to another thread to unlock */ ++ const char* file_name, /* in: file name where lock requested */ ++ ulint line); /* in: line where requested */ + /********************************************************************** + NOTE! Use the corresponding macro, not directly this function, except if + you supply the file name and line number. Lock an rw-lock in shared mode +@@ -135,18 +178,6 @@ + const char* file_name,/* in: file name where lock requested */ + ulint line); /* in: line where requested */ + /********************************************************************** +-NOTE! Use the corresponding macro, not directly this function, except if +-you supply the file name and line number. Lock an rw-lock in shared mode +-for the current thread if the lock can be acquired immediately. */ +-UNIV_INLINE +-ibool +-rw_lock_s_lock_func_nowait( +-/*=======================*/ +- /* out: TRUE if success */ +- rw_lock_t* lock, /* in: pointer to rw-lock */ +- const char* file_name,/* in: file name where lock requested */ +- ulint line); /* in: line where requested */ +-/********************************************************************** + NOTE! Use the corresponding macro, not directly this function! Lock an + rw-lock in exclusive mode for the current thread if the lock can be + obtained immediately. */ +@@ -338,6 +369,41 @@ + rw_lock_get_reader_count( + /*=====================*/ + rw_lock_t* lock); ++/********************************************************************** ++Decrements lock_word the specified amount if it is greater than 0. ++This is used by both s_lock and x_lock operations. 
*/ ++UNIV_INLINE ++ibool ++rw_lock_lock_word_decr( ++/*===================*/ ++ /* out: TRUE if decr occurs */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount); /* in: amount to decrement */ ++/********************************************************************** ++Increments lock_word the specified amount and returns new value. */ ++UNIV_INLINE ++lint ++rw_lock_lock_word_incr( ++/*===================*/ ++ /* out: lock->lock_word after increment */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount); /* in: amount of increment */ ++/********************************************************************** ++This function sets the lock->writer_thread and lock->recursive fields. ++For platforms where we are using atomic builtins instead of lock->mutex ++it sets the lock->writer_thread field using atomics to ensure memory ++ordering. Note that it is assumed that the caller of this function ++effectively owns the lock i.e.: nobody else is allowed to modify ++lock->writer_thread at this point in time. ++The protocol is that lock->writer_thread MUST be updated BEFORE the ++lock->recursive flag is set. */ ++UNIV_INLINE ++void ++rw_lock_set_writer_id_and_recursion_flag( ++/*=====================================*/ ++ rw_lock_t* lock, /* in/out: lock to work on */ ++ ibool recursive); /* in: TRUE if recursion ++ allowed */ + #ifdef UNIV_SYNC_DEBUG + /********************************************************************** + Checks if the thread has locked the rw-lock in the specified mode, with +@@ -417,47 +483,33 @@ + field. Then no new readers are allowed in. */ + + struct rw_lock_struct { ++ volatile lint lock_word; ++ /* Holds the state of the lock. */ ++ volatile ulint waiters;/* 1: there are waiters */ ++ volatile ibool recursive;/* Default value FALSE which means the lock ++ is non-recursive. The value is typically set ++ to TRUE making normal rw_locks recursive. In ++ case of asynchronous IO, when a non-zero ++ value of 'pass' is passed then we keep the ++ lock non-recursive. 
++ This flag also tells us about the state of ++ writer_thread field. If this flag is set ++ then writer_thread MUST contain the thread ++ id of the current x-holder or wait-x thread. ++ This flag must be reset in x_unlock ++ functions before incrementing the lock_word */ ++ volatile os_thread_id_t writer_thread; ++ /* Thread id of writer thread. Is only ++ guaranteed to have sane and non-stale ++ value iff recursive flag is set. */ + os_event_t event; /* Used by sync0arr.c for thread queueing */ +- +-#ifdef __WIN__ +- os_event_t wait_ex_event; /* This windows specific event is +- used by the thread which has set the +- lock state to RW_LOCK_WAIT_EX. The +- rw_lock design guarantees that this +- thread will be the next one to proceed +- once the current the event gets +- signalled. See LEMMA 2 in sync0sync.c */ +-#endif +- +- ulint reader_count; /* Number of readers who have locked this +- lock in the shared mode */ +- ulint writer; /* This field is set to RW_LOCK_EX if there +- is a writer owning the lock (in exclusive +- mode), RW_LOCK_WAIT_EX if a writer is +- queueing for the lock, and +- RW_LOCK_NOT_LOCKED, otherwise. */ +- os_thread_id_t writer_thread; +- /* Thread id of a possible writer thread */ +- ulint writer_count; /* Number of times the same thread has +- recursively locked the lock in the exclusive +- mode */ ++ os_event_t wait_ex_event; ++ /* Event for next-writer to wait on. A thread ++ must decrement lock_word before waiting. */ ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_t mutex; /* The mutex protecting rw_lock_struct */ +- ulint pass; /* Default value 0. This is set to some +- value != 0 given by the caller of an x-lock +- operation, if the x-lock is to be passed to +- another thread to unlock (which happens in +- asynchronous i/o). */ +- ulint waiters; /* This ulint is set to 1 if there are +- waiters (readers or writers) in the global +- wait array, waiting for this rw_lock. +- Otherwise, == 0. 
*/ +- ibool writer_is_wait_ex; +- /* This is TRUE if the writer field is +- RW_LOCK_WAIT_EX; this field is located far +- from the memory update hotspot fields which +- are at the start of this struct, thus we can +- peek this field without causing much memory +- bus traffic */ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ + UT_LIST_NODE_T(rw_lock_t) list; + /* All allocated rw locks are put into a + list */ +@@ -465,15 +517,23 @@ + UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list; + /* In the debug version: pointer to the debug + info list of the lock */ ++ ulint level; /* Level in the global latching order. */ + #endif /* UNIV_SYNC_DEBUG */ +- ulint level; /* Level in the global latching +- order; default SYNC_LEVEL_NONE */ ++ ulint count_os_wait; /* Count of os_waits. May not be accurate */ + const char* cfile_name;/* File name where lock created */ +- ulint cline; /* Line where created */ ++ /* last s-lock file/line is not guaranteed to be correct */ + const char* last_s_file_name;/* File name where last s-locked */ + const char* last_x_file_name;/* File name where last x-locked */ +- ulint last_s_line; /* Line number where last time s-locked */ +- ulint last_x_line; /* Line number where last time x-locked */ ++ ibool writer_is_wait_ex; ++ /* This is TRUE if the writer field is ++ RW_LOCK_WAIT_EX; this field is located far ++ from the memory update hotspot fields which ++ are at the start of this struct, thus we can ++ peek this field without causing much memory ++ bus traffic */ ++ unsigned cline:14; /* Line where created */ ++ unsigned last_s_line:14; /* Line number where last time s-locked */ ++ unsigned last_x_line:14; /* Line number where last time x-locked */ + ulint magic_n; + }; + +diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic +--- a/innobase/include/sync0rw.ic 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0rw.ic 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ 
++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for threads) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -49,53 +72,88 @@ + ulint + rw_lock_get_waiters( + /*================*/ +- rw_lock_t* lock) ++ /* out: 1 if waiters, 0 otherwise */ ++ rw_lock_t* lock) /* in: rw-lock */ + { + return(lock->waiters); + } ++ ++/************************************************************************ ++Sets lock->waiters to 1. It is not an error if lock->waiters is already ++1. On platforms where ATOMIC builtins are used this function enforces a ++memory barrier. 
*/ + UNIV_INLINE + void +-rw_lock_set_waiters( +-/*================*/ +- rw_lock_t* lock, +- ulint flag) ++rw_lock_set_waiter_flag( ++/*====================*/ ++ rw_lock_t* lock) /* in: rw-lock */ + { +- lock->waiters = flag; ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_compare_and_swap(&lock->waiters, 0, 1); ++#else /* HAVE_ATOMIC_BUILTINS */ ++ lock->waiters = 1; ++#endif /* HAVE_ATOMIC_BUILTINS */ + } ++ ++/************************************************************************ ++Resets lock->waiters to 0. It is not an error if lock->waiters is already ++0. On platforms where ATOMIC builtins are used this function enforces a ++memory barrier. */ + UNIV_INLINE +-ulint +-rw_lock_get_writer( +-/*===============*/ +- rw_lock_t* lock) ++void ++rw_lock_reset_waiter_flag( ++/*======================*/ ++ rw_lock_t* lock) /* in: rw-lock */ + { +- return(lock->writer); ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_compare_and_swap(&lock->waiters, 1, 0); ++#else /* HAVE_ATOMIC_BUILTINS */ ++ lock->waiters = 0; ++#endif /* HAVE_ATOMIC_BUILTINS */ + } ++ ++/********************************************************************** ++Returns the write-status of the lock - this function made more sense ++with the old rw_lock implementation. */ + UNIV_INLINE +-void +-rw_lock_set_writer( ++ulint ++rw_lock_get_writer( + /*===============*/ +- rw_lock_t* lock, +- ulint flag) ++ rw_lock_t* lock) + { +- lock->writer = flag; ++ lint lock_word = lock->lock_word; ++ if(lock_word > 0) { ++ /* return NOT_LOCKED in s-lock state, like the writer ++ member of the old lock implementation. */ ++ return(RW_LOCK_NOT_LOCKED); ++ } else if (((-lock_word) % X_LOCK_DECR) == 0) { ++ return(RW_LOCK_EX); ++ } else { ++ ut_ad(lock_word > -X_LOCK_DECR); ++ return(RW_LOCK_WAIT_EX); ++ } + } ++ ++/********************************************************************** ++Returns number of readers. 
*/ + UNIV_INLINE + ulint + rw_lock_get_reader_count( + /*=====================*/ + rw_lock_t* lock) + { +- return(lock->reader_count); +-} +-UNIV_INLINE +-void +-rw_lock_set_reader_count( +-/*=====================*/ +- rw_lock_t* lock, +- ulint count) +-{ +- lock->reader_count = count; ++ lint lock_word = lock->lock_word; ++ if(lock_word > 0) { ++ /* s-locked, no x-waiters */ ++ return(X_LOCK_DECR - lock_word); ++ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) { ++ /* s-locked, with x-waiters */ ++ return((ulint)(-lock_word)); ++ } ++ return(0); + } ++ ++#ifndef HAVE_ATOMIC_BUILTINS + UNIV_INLINE + mutex_t* + rw_lock_get_mutex( +@@ -104,6 +162,7 @@ + { + return(&(lock->mutex)); + } ++#endif + + /********************************************************************** + Returns the value of writer_count for the lock. Does not reserve the lock +@@ -115,7 +174,126 @@ + /* out: value of writer_count */ + rw_lock_t* lock) /* in: rw-lock */ + { +- return(lock->writer_count); ++ lint lock_copy = lock->lock_word; ++ /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */ ++ if(lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) { ++ return(0); ++ } ++ return(((-lock_copy) / X_LOCK_DECR) + 1); ++} ++ ++/********************************************************************** ++Two different implementations for decrementing the lock_word of a rw_lock: ++one for systems supporting atomic operations, one for others. This does ++not support recursive x-locks: they should be handled by the caller and ++need not be atomic since they are performed by the current lock holder. ++Returns true if the decrement was made, false if not. 
*/ ++UNIV_INLINE ++ibool ++rw_lock_lock_word_decr( ++/*===================*/ ++ /* out: TRUE if decr occurs */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount) /* in: amount of decrement */ ++{ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ++ lint local_lock_word = lock->lock_word; ++ while (local_lock_word > 0) { ++ if(os_compare_and_swap(&(lock->lock_word), ++ local_lock_word, ++ local_lock_word - amount)) { ++ return(TRUE); ++ } ++ local_lock_word = lock->lock_word; ++ } ++ return(FALSE); ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ ibool success = FALSE; ++ mutex_enter(&(lock->mutex)); ++ if(lock->lock_word > 0) { ++ lock->lock_word -= amount; ++ success = TRUE; ++ } ++ mutex_exit(&(lock->mutex)); ++ return(success); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++} ++ ++/********************************************************************** ++Two different implementations for incrementing the lock_word of a rw_lock: ++one for systems supporting atomic operations, one for others. ++Returns the value of lock_word after increment. */ ++UNIV_INLINE ++lint ++rw_lock_lock_word_incr( ++/*===================*/ ++ /* out: lock->lock_word after increment */ ++ rw_lock_t* lock, /* in: rw-lock */ ++ ulint amount) /* in: amount of increment */ ++{ ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ ++ return(os_atomic_increment(&(lock->lock_word), amount)); ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ lint local_lock_word; ++ ++ mutex_enter(&(lock->mutex)); ++ ++ lock->lock_word += amount; ++ local_lock_word = lock->lock_word; ++ ++ mutex_exit(&(lock->mutex)); ++ ++ return(local_lock_word); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ ++} ++ ++/********************************************************************** ++This function sets the lock->writer_thread and lock->recursive fields. ++For platforms where we are using atomic builtins instead of lock->mutex ++it sets the lock->writer_thread field using atomics to ensure memory ++ordering. 
Note that it is assumed that the caller of this function ++effectively owns the lock i.e.: nobody else is allowed to modify ++lock->writer_thread at this point in time. ++The protocol is that lock->writer_thread MUST be updated BEFORE the ++lock->recursive flag is set. */ ++UNIV_INLINE ++void ++rw_lock_set_writer_id_and_recursion_flag( ++/*=====================================*/ ++ rw_lock_t* lock, /* in/out: lock to work on */ ++ ibool recursive) /* in: TRUE if recursion ++ allowed */ ++{ ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); ++ ++#ifdef HAVE_ATOMIC_BUILTINS ++ os_thread_id_t local_thread; ++ ibool success; ++ ++ local_thread = lock->writer_thread; ++ success = os_compare_and_swap(&lock->writer_thread, ++ local_thread, curr_thread); ++ ut_a(success); ++ lock->recursive = recursive; ++ ++#else /* HAVE_ATOMIC_BUILTINS */ ++ ++ mutex_enter(&lock->mutex); ++ lock->writer_thread = curr_thread; ++ lock->recursive = recursive; ++ mutex_exit(&lock->mutex); ++ ++#endif /* HAVE_ATOMIC_BUILTINS */ + } + + /********************************************************************** +@@ -133,26 +311,21 @@ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(rw_lock_get_mutex(lock))); +-#endif /* UNIV_SYNC_DEBUG */ +- /* Check if the writer field is free */ +- +- if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) { +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. 
*/ ++ if (!rw_lock_lock_word_decr(lock, 1)) { ++ /* Locking did not succeed */ ++ return(FALSE); ++ } + + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, +- line); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line); + #endif +- lock->last_s_file_name = file_name; +- lock->last_s_line = line; +- +- return(TRUE); /* locking succeeded */ +- } ++ /* These debugging values are not set safely: they may be incorrect ++ or even refer to a line that is invalid for the file name. */ ++ lock->last_s_file_name = file_name; ++ lock->last_s_line = line; + +- return(FALSE); /* locking did not succeed */ ++ return(TRUE); /* locking succeeded */ + } + + /********************************************************************** +@@ -167,11 +340,10 @@ + const char* file_name, /* in: file name where requested */ + ulint line) /* in: line where lock requested */ + { +- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED); +- ut_ad(rw_lock_get_reader_count(lock) == 0); ++ ut_ad(lock->lock_word == X_LOCK_DECR); + +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ /* Indicate there is a new reader by decrementing lock_word */ ++ lock->lock_word--; + + lock->last_s_file_name = file_name; + lock->last_s_line = line; +@@ -194,13 +366,11 @@ + ulint line) /* in: line where lock requested */ + { + ut_ad(rw_lock_validate(lock)); +- ut_ad(rw_lock_get_reader_count(lock) == 0); +- ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); ++ ut_ad(lock->lock_word == X_LOCK_DECR); + +- rw_lock_set_writer(lock, RW_LOCK_EX); ++ lock->lock_word -= X_LOCK_DECR; + lock->writer_thread = os_thread_get_curr_id(); +- lock->writer_count++; +- lock->pass = 0; ++ lock->recursive = TRUE; + + lock->last_x_file_name = file_name; + lock->last_x_line = line; +@@ -241,15 +411,12 @@ + ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */ + #endif /* UNIV_SYNC_DEBUG */ + +- mutex_enter(rw_lock_get_mutex(lock)); +- +- if 
(UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) { +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */ ++ if (rw_lock_s_lock_low(lock, pass, file_name, line)) { + + return; /* Success */ + } else { + /* Did not succeed, try spin wait */ +- mutex_exit(rw_lock_get_mutex(lock)); + + rw_lock_s_lock_spin(lock, pass, file_name, line); + +@@ -259,86 +426,60 @@ + + /********************************************************************** + NOTE! Use the corresponding macro, not directly this function! Lock an +-rw-lock in shared mode for the current thread if the lock can be acquired +-immediately. */ ++rw-lock in exclusive mode for the current thread if the lock can be ++obtained immediately. */ + UNIV_INLINE + ibool +-rw_lock_s_lock_func_nowait( ++rw_lock_x_lock_func_nowait( + /*=======================*/ + /* out: TRUE if success */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +- ibool success = FALSE; +- +- mutex_enter(rw_lock_get_mutex(lock)); +- +- if (lock->writer == RW_LOCK_NOT_LOCKED) { +- /* Set the shared lock by incrementing the reader count */ +- lock->reader_count++; ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); + +-#ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, +- line); +-#endif ++ ibool success; + +- lock->last_s_file_name = file_name; +- lock->last_s_line = line; ++#ifdef HAVE_ATOMIC_BUILTINS ++ success = os_compare_and_swap(&(lock->lock_word), X_LOCK_DECR, 0); ++#else + ++ success = FALSE; ++ mutex_enter(&(lock->mutex)); ++ if (lock->lock_word == X_LOCK_DECR) { ++ lock->lock_word = 0; + success = TRUE; + } ++ mutex_exit(&(lock->mutex)); + +- mutex_exit(rw_lock_get_mutex(lock)); +- +- return(success); +-} ++#endif ++ if (success) { ++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + 
+-/********************************************************************** +-NOTE! Use the corresponding macro, not directly this function! Lock an +-rw-lock in exclusive mode for the current thread if the lock can be +-obtained immediately. */ +-UNIV_INLINE +-ibool +-rw_lock_x_lock_func_nowait( +-/*=======================*/ +- /* out: TRUE if success */ +- rw_lock_t* lock, /* in: pointer to rw-lock */ +- const char* file_name,/* in: file name where lock requested */ +- ulint line) /* in: line where requested */ +-{ +- ibool success = FALSE; +- os_thread_id_t curr_thread = os_thread_get_curr_id(); +- mutex_enter(rw_lock_get_mutex(lock)); ++ } else if (lock->recursive ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ /* Relock: this lock_word modification is safe since no other ++ threads can modify (lock, unlock, or reserve) lock_word while ++ there is an exclusive writer and this is the writer thread. */ ++ lock->lock_word -= X_LOCK_DECR; + +- if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) { +- } else if (UNIV_LIKELY(rw_lock_get_writer(lock) +- == RW_LOCK_NOT_LOCKED)) { +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_thread = curr_thread; +- lock->pass = 0; +- relock: +- lock->writer_count++; ++ ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0); + ++ } else { ++ /* Failure */ ++ return(FALSE); ++ } + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); ++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line); + #endif + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- success = TRUE; +- } else if (rw_lock_get_writer(lock) == RW_LOCK_EX +- && lock->pass == 0 +- && os_thread_eq(lock->writer_thread, curr_thread)) { +- goto relock; +- } +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ lock->last_x_file_name = file_name; ++ lock->last_x_line = line; + + ut_ad(rw_lock_validate(lock)); + +- return(success); ++ return(TRUE); + } + + 
/********************************************************************** +@@ -354,39 +495,21 @@ + #endif + ) + { +- mutex_t* mutex = &(lock->mutex); +- ibool sg = FALSE; +- +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter(mutex); +- +- /* Reset the shared lock by decrementing the reader count */ +- +- ut_a(lock->reader_count > 0); +- lock->reader_count--; ++ ut_ad((lock->lock_word % X_LOCK_DECR) != 0); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED); + #endif + +- /* If there may be waiters and this was the last s-lock, +- signal the object */ ++ /* Increment lock_word to indicate 1 less reader */ ++ if (rw_lock_lock_word_incr(lock, 1) == 0) { + +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->reader_count == 0) { +- sg = TRUE; +- +- rw_lock_set_waiters(lock, 0); +- } +- +- mutex_exit(mutex); +- +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ ++ /* wait_ex waiter exists. It may not be asleep, but we signal ++ anyway. We do not wake other waiters, because they can't ++ exist without wait_ex waiter and wait_ex waiter goes first.*/ + os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); + sync_array_object_signalled(sync_primary_wait_array); ++ + } + + ut_ad(rw_lock_validate(lock)); +@@ -405,16 +528,15 @@ + /*====================*/ + rw_lock_t* lock) /* in: rw-lock */ + { +- /* Reset the shared lock by decrementing the reader count */ +- +- ut_ad(lock->reader_count > 0); +- +- lock->reader_count--; ++ ut_ad(lock->lock_word < X_LOCK_DECR); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED); + #endif + ++ /* Decrease reader count by incrementing lock_word */ ++ lock->lock_word++; ++ + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); + #ifdef UNIV_SYNC_PERF_STAT +@@ -435,42 +557,32 @@ + #endif + ) + { +- ibool sg = FALSE; +- +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter(&(lock->mutex)); +- +- /* Reset the exclusive lock if this thread no 
longer has an x-mode +- lock */ +- +- ut_ad(lock->writer_count > 0); ++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + +- lock->writer_count--; +- +- if (lock->writer_count == 0) { +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); ++ /* lock->recursive flag also indicates if lock->writer_thread is ++ valid or stale. If we are the last of the recursive callers ++ then we must unset lock->recursive flag to indicate that the ++ lock->writer_thread is now stale. ++ Note that since we still hold the x-lock we can safely read the ++ lock_word. */ ++ if (lock->lock_word == 0) { ++ /* Last caller in a possible recursive chain. */ ++ lock->recursive = FALSE; + } + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX); + #endif + +- /* If there may be waiters, signal the lock */ +- if (UNIV_UNLIKELY(lock->waiters) +- && lock->writer_count == 0) { +- +- sg = TRUE; +- rw_lock_set_waiters(lock, 0); +- } +- +- mutex_exit(&(lock->mutex)); +- +- if (UNIV_UNLIKELY(sg)) { +-#ifdef __WIN__ +- os_event_set(lock->wait_ex_event); +-#endif +- os_event_set(lock->event); +- sync_array_object_signalled(sync_primary_wait_array); ++ if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) { ++ /* Lock is now free. May have to signal read/write waiters. ++ We do not need to signal wait_ex waiters, since they cannot ++ exist when there is a writer. 
*/ ++ if (lock->waiters) { ++ rw_lock_reset_waiter_flag(lock); ++ os_event_set(lock->event); ++ sync_array_object_signalled(sync_primary_wait_array); ++ } + } + + ut_ad(rw_lock_validate(lock)); +@@ -492,18 +604,18 @@ + /* Reset the exclusive lock if this thread no longer has an x-mode + lock */ + +- ut_ad(lock->writer_count > 0); +- +- lock->writer_count--; +- +- if (lock->writer_count == 0) { +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); +- } ++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0); + + #ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); + #endif + ++ if (lock->lock_word == 0) { ++ lock->recursive = FALSE; ++ } ++ ++ lock->lock_word += X_LOCK_DECR; ++ + ut_ad(!lock->waiters); + ut_ad(rw_lock_validate(lock)); + +diff -ruN a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h +--- a/innobase/include/sync0sync.h 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/include/sync0sync.h 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -465,8 +488,11 @@ + struct mutex_struct { + os_event_t event; /* Used by sync0arr.c for the wait queue */ + ulint lock_word; /* This ulint is the target of the atomic +- test-and-set instruction in Win32 */ +-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER) ++ test-and-set instruction in Win32 and ++ x86 32/64 with GCC 4.1.0 or later version */ ++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#elif defined(HAVE_ATOMIC_BUILTINS) ++#else + os_fast_mutex_t + os_fast_mutex; /* In other systems we use this OS mutex + in place of lock_word */ +@@ -525,8 +551,7 @@ + /* The number of system calls made in this module. Intended for performance + monitoring. */ + +-extern ulint mutex_system_call_count; +-extern ulint mutex_exit_count; ++extern ib_longlong mutex_exit_count; + + /* Latching order checks start when this is set TRUE */ + extern ibool sync_order_checks_on; +diff -ruN a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic +--- a/innobase/include/sync0sync.ic 2009-09-10 04:02:59.000000000 +0900 ++++ b/innobase/include/sync0sync.ic 2009-10-22 15:18:44.000000000 +0900 +@@ -1,21 +1,34 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. 
Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +-#if defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +-/* %z0: Use the size of operand %0 which in our case is *m to determine +-instruction size, it should end up as xchgl. "1" in the input constraint, +-says that "in" has to go in the same place as "out".*/ +-#define TAS(m, in, out) \ +- asm volatile ("xchg%z0 %2, %0" \ +- : "=g" (*(m)), "=r" (out) \ +- : "1" (in)) /* Note: "1" here refers to "=r" (out) */ +-#endif +- + /********************************************************************** + Sets the waiters field in a mutex. 
*/ + +@@ -94,12 +107,8 @@ + /* mutex_fence(); */ + + return(res); +-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +- ulint res; +- +- TAS(&mutex->lock_word, 1, res); +- +- return(res); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ return __sync_lock_test_and_set(&(mutex->lock_word), 1); + #else + ibool ret; + +@@ -136,10 +145,11 @@ + __asm MOV EDX, 0 + __asm MOV ECX, lw + __asm XCHG EDX, DWORD PTR [ECX] +-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86) +- ulint res; +- +- TAS(&mutex->lock_word, 0, res); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ /* In theory __sync_lock_release should be used to release the lock. ++ Unfortunately, it does not work properly alone. The workaround is ++ that more conservative __sync_lock_test_and_set is used instead. */ ++ __sync_lock_test_and_set(&(mutex->lock_word), 0); + #else + mutex->lock_word = 0; + +diff -ruN a/innobase/row/row0sel.c b/innobase/row/row0sel.c +--- a/innobase/row/row0sel.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/row/row0sel.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1178,7 +1178,7 @@ + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; +- } else if (btr_search_latch.writer_is_wait_ex) { ++ } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often +@@ -3123,7 +3123,7 @@ + /* PHASE 0: Release a possible s-latch we are holding on the + adaptive hash index latch if there is someone waiting behind */ + +- if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED) ++ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED) + && trx->has_search_latch) { + + /* There is an x-latch request on the adaptive hash index: +diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c +--- a/innobase/sync/sync0arr.c 2009-09-10 04:03:01.000000000 +0900 ++++ b/innobase/sync/sync0arr.c 2009-10-22 
15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The wait array used in synchronization primitives + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -297,25 +320,21 @@ + } + + /*********************************************************************** +-Puts the cell event in reset state. */ ++Returns the event that the thread owning the cell waits for. */ + static +-ib_longlong +-sync_cell_event_reset( +-/*==================*/ +- /* out: value of signal_count +- at the time of reset. 
*/ +- ulint type, /* in: lock type mutex/rw_lock */ +- void* object) /* in: the rw_lock/mutex object */ ++os_event_t ++sync_cell_get_event( ++/*================*/ ++ sync_cell_t* cell) /* in: non-empty sync array cell */ + { ++ ulint type = cell->request_type; ++ + if (type == SYNC_MUTEX) { +- return(os_event_reset(((mutex_t *) object)->event)); +-#ifdef __WIN__ ++ return(((mutex_t *) cell->wait_object)->event); + } else if (type == RW_LOCK_WAIT_EX) { +- return(os_event_reset( +- ((rw_lock_t *) object)->wait_ex_event)); +-#endif +- } else { +- return(os_event_reset(((rw_lock_t *) object)->event)); ++ return(((rw_lock_t *) cell->wait_object)->wait_ex_event); ++ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ ++ return(((rw_lock_t *) cell->wait_object)->event); + } + } + +@@ -334,6 +353,7 @@ + ulint* index) /* out: index of the reserved cell */ + { + sync_cell_t* cell; ++ os_event_t event; + ulint i; + + ut_a(object); +@@ -372,8 +392,8 @@ + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ +- cell->signal_count = sync_cell_event_reset(type, +- object); ++ event = sync_cell_get_event(cell); ++ cell->signal_count = os_event_reset(event); + + cell->reservation_time = time(NULL); + +@@ -413,19 +433,7 @@ + ut_a(!cell->waiting); + ut_ad(os_thread_get_curr_id() == cell->thread); + +- if (cell->request_type == SYNC_MUTEX) { +- event = ((mutex_t*) cell->wait_object)->event; +-#ifdef __WIN__ +- /* On windows if the thread about to wait is the one which +- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then +- it waits on a special event i.e.: wait_ex_event. 
*/ +- } else if (cell->request_type == RW_LOCK_WAIT_EX) { +- event = ((rw_lock_t*) cell->wait_object)->wait_ex_event; +-#endif +- } else { +- event = ((rw_lock_t*) cell->wait_object)->event; +- } +- ++ event = sync_cell_get_event(cell); + cell->waiting = TRUE; + + #ifdef UNIV_SYNC_DEBUG +@@ -464,6 +472,7 @@ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; ++ ulint writer; + + type = cell->request_type; + +@@ -492,9 +501,7 @@ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX +-#ifdef __WIN__ + || type == RW_LOCK_WAIT_EX +-#endif + || type == RW_LOCK_SHARED) { + + fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); +@@ -505,21 +512,24 @@ + " RW-latch at %p created in file %s line %lu\n", + rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); +- if (rwlock->writer != RW_LOCK_NOT_LOCKED) { ++ writer = rw_lock_get_writer(rwlock); ++ if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), +- rwlock->writer == RW_LOCK_EX ++ writer == RW_LOCK_EX + ? " exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, +- "number of readers %lu, waiters flag %lu\n" ++ "number of readers %lu, waiters flag %lu, " ++ "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", +- (ulong) rwlock->reader_count, ++ (ulong) rw_lock_get_reader_count(rwlock), + (ulong) rwlock->waiters, ++ rwlock->lock_word, + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, +@@ -773,28 +783,30 @@ + return(TRUE); + } + +- } else if (cell->request_type == RW_LOCK_EX +- || cell->request_type == RW_LOCK_WAIT_EX) { ++ } else if (cell->request_type == RW_LOCK_EX) { + + lock = cell->wait_object; + +- if (rw_lock_get_reader_count(lock) == 0 +- && rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ if (lock->lock_word > 0) { ++ /* Either unlocked or only read locked. 
*/ + + return(TRUE); + } + +- if (rw_lock_get_reader_count(lock) == 0 +- && rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX +- && os_thread_eq(lock->writer_thread, cell->thread)) { ++ } else if (cell->request_type == RW_LOCK_WAIT_EX) { ++ ++ lock = cell->wait_object; ++ ++ /* lock_word == 0 means all readers have left */ ++ if (lock->lock_word == 0) { + + return(TRUE); + } +- + } else if (cell->request_type == RW_LOCK_SHARED) { + lock = cell->wait_object; + +- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ /* lock_word > 0 means no writer or reserved writer */ ++ if (lock->lock_word > 0) { + + return(TRUE); + } +@@ -839,11 +851,15 @@ + /*========================*/ + sync_array_t* arr) /* in: wait array */ + { ++#ifdef HAVE_ATOMIC_BUILTINS ++ (void) os_atomic_increment(&arr->sg_count, 1); ++#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); ++#endif + } + + /************************************************************************** +@@ -859,6 +875,7 @@ + sync_cell_t* cell; + ulint count; + ulint i; ++ os_event_t event; + + sync_array_enter(arr); + +@@ -868,36 +885,20 @@ + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); ++ i++; + +- if (cell->wait_object != NULL) { +- ++ if (cell->wait_object == NULL) { ++ continue; ++ } + count++; + + if (sync_arr_cell_can_wake_up(cell)) { + +- if (cell->request_type == SYNC_MUTEX) { +- mutex_t* mutex; ++ event = sync_cell_get_event(cell); + +- mutex = cell->wait_object; +- os_event_set(mutex->event); +-#ifdef __WIN__ +- } else if (cell->request_type +- == RW_LOCK_WAIT_EX) { +- rw_lock_t* lock; +- +- lock = cell->wait_object; +- os_event_set(lock->wait_ex_event); +-#endif +- } else { +- rw_lock_t* lock; +- +- lock = cell->wait_object; +- os_event_set(lock->event); +- } +- } ++ os_event_set(event); + } + +- i++; + } + + sync_array_exit(arr); +@@ -1014,4 +1015,3 @@ + + sync_array_exit(arr); + } +- +diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c +--- 
a/innobase/sync/sync0rw.c 2009-09-10 04:03:01.000000000 +0900 ++++ b/innobase/sync/sync0rw.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + The read-write lock (for thread synchronization) + +-(c) 1995 Innobase Oy +- + Created 9/11/1995 Heikki Tuuri + *******************************************************/ + +@@ -15,17 +38,110 @@ + #include "mem0mem.h" + #include "srv0srv.h" + +-ulint rw_s_system_call_count = 0; +-ulint rw_s_spin_wait_count = 0; +-ulint rw_s_os_wait_count = 0; ++/* ++ IMPLEMENTATION OF THE RW_LOCK ++ ============================= ++The status of a rw_lock is held in lock_word. The initial value of lock_word is ++X_LOCK_DECR. 
lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR ++for each x-lock. This describes the lock state for each value of lock_word: ++ ++lock_word == X_LOCK_DECR: Unlocked. ++0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers. ++ (X_LOCK_DECR - lock_word) is the ++ number of readers that hold the lock. ++lock_word == 0: Write locked ++-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer. ++ (-lock_word) is the number of readers ++ that hold the lock. ++lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been ++ decremented by X_LOCK_DECR once for each lock, ++ so the number of locks is: ++ ((-lock_word) / X_LOCK_DECR) + 1 ++When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0: ++other values of lock_word are invalid. ++ ++The lock_word is always read and updated atomically and consistently, so that ++it always represents the state of the lock, and the state of the lock changes ++with a single atomic operation. This lock_word holds all of the information ++that a thread needs in order to determine if it is eligible to gain the lock ++or if it must spin or sleep. The one exception to this is that writer_thread ++must be verified before recursive write locks: to solve this scenario, we make ++writer_thread readable by all threads, but only writeable by the x-lock holder. ++ ++The other members of the lock obey the following rules to remain consistent: ++ ++recursive: This and the writer_thread field together control the ++ behaviour of recursive x-locking. ++ lock->recursive must be FALSE in following states: ++ 1) The writer_thread contains garbage i.e.: the ++ lock has just been initialized. ++ 2) The lock is not x-held and there is no ++ x-waiter waiting on WAIT_EX event. ++ 3) The lock is x-held or there is an x-waiter ++ waiting on WAIT_EX event but the 'pass' value ++ is non-zero. 
++ lock->recursive is TRUE iff: ++ 1) The lock is x-held or there is an x-waiter ++ waiting on WAIT_EX event and the 'pass' value ++ is zero. ++ This flag must be set after the writer_thread field ++ has been updated with a memory ordering barrier. ++ It is unset before the lock_word has been incremented. ++writer_thread: Is used only in recursive x-locking. Can only be safely ++ read iff lock->recursive flag is TRUE. ++ This field is uninitialized at lock creation time and ++ is updated atomically when x-lock is acquired or when ++ move_ownership is called. A thread is only allowed to ++ set the value of this field to its thread_id i.e.: a ++ thread cannot set writer_thread to some other thread's ++ id. ++waiters: May be set to 1 anytime, but to avoid unnecessary wake-up ++ signals, it should only be set to 1 when there are threads ++ waiting on event. Must be 1 when a writer starts waiting to ++ ensure the current x-locking thread sends a wake-up signal ++ during unlock. May only be reset to 0 immediately before ++ a wake-up signal is sent to event. On most platforms, a ++ memory barrier is required after waiters is set, and before ++ verifying lock_word is still held, to ensure some unlocker ++ really does see the flag's new value. ++event: Threads wait on event for read or writer lock when another ++ thread has an x-lock or an x-lock reservation (wait_ex). A ++ thread may only wait on event after performing the following ++ actions in order: ++ (1) Record the counter value of event (with os_event_reset). ++ (2) Set waiters to 1. ++ (3) Verify lock_word <= 0. ++ (1) must come before (2) to ensure signal is not missed. ++ (2) must come before (3) to ensure a signal is sent. ++ These restrictions force the above ordering. ++ Immediately before sending the wake-up signal, we should: ++ (1) Verify lock_word == X_LOCK_DECR (unlocked) ++ (2) Reset waiters to 0. 
++wait_ex_event: A thread may only wait on the wait_ex_event after it has ++ performed the following actions in order: ++ (1) Decrement lock_word by X_LOCK_DECR. ++ (2) Record counter value of wait_ex_event (os_event_reset, ++ called from sync_array_reserve_cell). ++ (3) Verify that lock_word < 0. ++ (1) must come first to ensure no other threads become reader ++ or next writer, and to notify the unlocker that a signal must be sent. ++ (2) must come before (3) to ensure the signal is not missed. ++ These restrictions force the above ordering. ++ Immediately before sending the wake-up signal, we should: ++ Verify lock_word == 0 (waiting thread holds x_lock) ++*/ ++ ++ib_longlong rw_s_spin_wait_count = 0; ++ib_longlong rw_s_spin_round_count = 0; ++ib_longlong rw_s_os_wait_count = 0; ++ ++ib_longlong rw_s_exit_count = 0; ++ ++ib_longlong rw_x_spin_wait_count = 0; ++ib_longlong rw_x_spin_round_count = 0; ++ib_longlong rw_x_os_wait_count = 0; + +-ulint rw_s_exit_count = 0; +- +-ulint rw_x_system_call_count = 0; +-ulint rw_x_spin_wait_count = 0; +-ulint rw_x_os_wait_count = 0; +- +-ulint rw_x_exit_count = 0; ++ib_longlong rw_x_exit_count = 0; + + /* The global list of rw-locks */ + rw_lock_list_t rw_lock_list; +@@ -99,22 +215,30 @@ + object is created, then the following call initializes + the sync system. 
*/ + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_create(rw_lock_get_mutex(lock)); + mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + + lock->mutex.cfile_name = cfile_name; + lock->mutex.cline = cline; +-#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP ++# if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; +-#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ ++# endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +- rw_lock_set_waiters(lock, 0); +- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED); +- lock->writer_count = 0; +- rw_lock_set_reader_count(lock, 0); +- +- lock->writer_is_wait_ex = FALSE; ++#else /* HAVE_ATOMIC_BUILTINS */ ++# ifdef UNIV_DEBUG ++ UT_NOT_USED(cmutex_name); ++# endif ++#endif /* HAVE_ATOMIC_BUILTINS */ ++ ++ lock->lock_word = X_LOCK_DECR; ++ lock->waiters = 0; ++ ++ /* We set this value to signify that lock->writer_thread ++ contains garbage at initialization and cannot be used for ++ recursive x-locking. 
*/ ++ lock->recursive = FALSE; + + #ifdef UNIV_SYNC_DEBUG + UT_LIST_INIT(lock->debug_list); +@@ -126,15 +250,13 @@ + lock->cfile_name = cfile_name; + lock->cline = cline; + ++ lock->count_os_wait = 0; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; + lock->event = os_event_create(NULL); +- +-#ifdef __WIN__ + lock->wait_ex_event = os_event_create(NULL); +-#endif + + mutex_enter(&rw_lock_list_mutex); + +@@ -158,23 +280,17 @@ + /*=========*/ + rw_lock_t* lock) /* in: rw-lock */ + { +-#ifdef UNIV_DEBUG + ut_a(rw_lock_validate(lock)); +-#endif /* UNIV_DEBUG */ +- ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); +- ut_a(rw_lock_get_waiters(lock) == 0); +- ut_a(rw_lock_get_reader_count(lock) == 0); ++ ut_a(lock->lock_word == X_LOCK_DECR); + +- lock->magic_n = 0; +- ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_free(rw_lock_get_mutex(lock)); ++#endif /* HAVE_ATOMIC_BUILTINS */ + + mutex_enter(&rw_lock_list_mutex); + os_event_free(lock->event); + +-#ifdef __WIN__ + os_event_free(lock->wait_ex_event); +-#endif + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); +@@ -186,6 +302,8 @@ + UT_LIST_REMOVE(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); ++ ++ lock->magic_n = 0; + } + + /********************************************************************** +@@ -199,19 +317,12 @@ + { + ut_a(lock); + +- mutex_enter(rw_lock_get_mutex(lock)); ++ ulint waiters = rw_lock_get_waiters(lock); ++ lint lock_word = lock->lock_word; + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); +- ut_a((rw_lock_get_reader_count(lock) == 0) +- || (rw_lock_get_writer(lock) != RW_LOCK_EX)); +- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)); +- ut_a((rw_lock_get_waiters(lock) == 0) +- || (rw_lock_get_waiters(lock) == 1)); +- ut_a((lock->writer != 
RW_LOCK_EX) || (lock->writer_count > 0)); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ ut_a(waiters == 0 || waiters == 1); ++ ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0); + + return(TRUE); + } +@@ -232,18 +343,15 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint i; /* spin round count */ ++ ulint i = 0; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + ++ rw_s_spin_wait_count++; /* Count calls to this function */ + lock_loop: +- rw_s_spin_wait_count++; + + /* Spin waiting for the writer field to become free */ +- i = 0; +- +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { ++ while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } +@@ -262,28 +370,32 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + +- mutex_enter(rw_lock_get_mutex(lock)); +- + /* We try once again to obtain the lock */ +- + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { +- mutex_exit(rw_lock_get_mutex(lock)); ++ rw_s_spin_round_count += i; + + return; /* Success */ + } else { +- /* If we get here, locking did not succeed, we may +- suspend the thread to wait in the wait array */ + +- rw_s_system_call_count++; ++ if (i < SYNC_SPIN_ROUNDS) { ++ goto lock_loop; ++ } ++ ++ rw_s_spin_round_count += i; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, RW_LOCK_SHARED, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* Set waiters before checking lock_word to ensure wake-up ++ signal is sent. This may lead to some unnecessary signals. 
*/ ++ rw_lock_set_waiter_flag(lock); ++ ++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Success */ ++ } + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -292,11 +404,13 @@ + lock, lock->cfile_name, (ulong) lock->cline); + } + +- rw_s_system_call_count++; ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; + rw_s_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + ++ i = 0; + goto lock_loop; + } + } +@@ -318,114 +432,130 @@ + { + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + +- mutex_enter(&(lock->mutex)); +- +- lock->writer_thread = os_thread_get_curr_id(); +- +- lock->pass = 0; +- +- mutex_exit(&(lock->mutex)); ++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); + } + + /********************************************************************** +-Low-level function for acquiring an exclusive lock. */ ++Function for the next writer to call. Waits for readers to exit. 
++The caller must have already decremented lock_word by X_LOCK_DECR.*/ + UNIV_INLINE +-ulint +-rw_lock_x_lock_low( +-/*===============*/ +- /* out: RW_LOCK_NOT_LOCKED if did +- not succeed, RW_LOCK_EX if success, +- RW_LOCK_WAIT_EX, if got wait reservation */ ++void ++rw_lock_x_lock_wait( ++/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ ++#ifdef UNIV_SYNC_DEBUG + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ ++#endif + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ + { +-#ifdef UNIV_SYNC_DEBUG +- ut_ad(mutex_own(rw_lock_get_mutex(lock))); +-#endif /* UNIV_SYNC_DEBUG */ +- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) { ++ ulint index; ++ ulint i = 0; + +- if (rw_lock_get_reader_count(lock) == 0) { ++ ut_ad(lock->lock_word <= 0); ++ ++ while (lock->lock_word < 0) { ++ if (srv_spin_wait_delay) { ++ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); ++ } ++ if(i < SYNC_SPIN_ROUNDS) { ++ i++; ++ continue; ++ } + +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_thread = os_thread_get_curr_id(); +- lock->writer_count++; +- lock->pass = pass; ++ /* If there is still a reader, then go to sleep.*/ ++ rw_x_spin_round_count += i; ++ i = 0; ++ sync_array_reserve_cell(sync_primary_wait_array, ++ lock, ++ RW_LOCK_WAIT_EX, ++ file_name, line, ++ &index); ++ /* Check lock_word to ensure wake-up isn't missed.*/ ++ if(lock->lock_word < 0) { + ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; ++ rw_x_os_wait_count++; ++ ++ /* Add debug info as it is needed to detect possible ++ deadlock. We must add info for WAIT_EX thread for ++ deadlock detection to work properly. 
*/ + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); + #endif +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); +- } else { +- /* There are readers, we have to wait */ +- rw_lock_set_writer(lock, RW_LOCK_WAIT_EX); +- lock->writer_thread = os_thread_get_curr_id(); +- lock->pass = pass; +- lock->writer_is_wait_ex = TRUE; + ++ sync_array_wait_event(sync_primary_wait_array, ++ index); + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, +- file_name, line); ++ rw_lock_remove_debug_info(lock, pass, ++ RW_LOCK_WAIT_EX); + #endif +- +- return(RW_LOCK_WAIT_EX); ++ /* It is possible to wake when lock_word < 0. ++ We must pass the while-loop check to proceed.*/ ++ } else { ++ sync_array_free_cell(sync_primary_wait_array, ++ index); + } ++ } ++ rw_x_spin_round_count += i; ++} + +- } else if ((rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX) +- && os_thread_eq(lock->writer_thread, +- os_thread_get_curr_id())) { ++/********************************************************************** ++Low-level function for acquiring an exclusive lock. */ ++UNIV_INLINE ++ibool ++rw_lock_x_lock_low( ++/*===============*/ ++ /* out: FALSE if did ++ not succeed, TRUE if success. 
*/ ++ rw_lock_t* lock, /* in: pointer to rw-lock */ ++ ulint pass, /* in: pass value; != 0, if the lock will ++ be passed to another thread to unlock */ ++ const char* file_name,/* in: file name where lock requested */ ++ ulint line) /* in: line where requested */ ++{ ++ os_thread_id_t curr_thread = os_thread_get_curr_id(); + +- if (rw_lock_get_reader_count(lock) == 0) { ++ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { + +- rw_lock_set_writer(lock, RW_LOCK_EX); +- lock->writer_count++; +- lock->pass = pass; +- lock->writer_is_wait_ex = FALSE; ++ /* lock->recursive also tells us if the writer_thread ++ field is stale or active. As we are going to write ++ our own thread id in that field it must be that the ++ current writer_thread value is not active. */ ++ ut_a(!lock->recursive); + ++ /* Decrement occurred: we are writer or next-writer. */ ++ rw_lock_set_writer_id_and_recursion_flag(lock, ++ pass ? FALSE : TRUE); ++ ++ rw_lock_x_lock_wait(lock, + #ifdef UNIV_SYNC_DEBUG +- rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX); +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, +- file_name, line); ++ pass, + #endif ++ file_name, line); + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); ++ } else { ++ /* Decrement failed: relock or failed lock */ ++ if (!pass && lock->recursive ++ && os_thread_eq(lock->writer_thread, curr_thread)) { ++ /* Relock */ ++ lock->lock_word -= X_LOCK_DECR; ++ } else { ++ /* Another thread locked before us */ ++ return(FALSE); + } +- +- return(RW_LOCK_WAIT_EX); +- +- } else if ((rw_lock_get_writer(lock) == RW_LOCK_EX) +- && os_thread_eq(lock->writer_thread, +- os_thread_get_curr_id()) +- && (lock->pass == 0) +- && (pass == 0)) { +- +- lock->writer_count++; +- ++ } + #ifdef UNIV_SYNC_DEBUG +- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, +- line); ++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, ++ file_name, line); + #endif ++ 
lock->last_x_file_name = file_name; ++ lock->last_x_line = (unsigned int) line; + +- lock->last_x_file_name = file_name; +- lock->last_x_line = line; +- +- /* Locking succeeded, we may return */ +- return(RW_LOCK_EX); +- } +- +- /* Locking did not succeed */ +- return(RW_LOCK_NOT_LOCKED); ++ return(TRUE); + } + + /********************************************************************** +@@ -448,47 +578,30 @@ + ulint line) /* in: line where requested */ + { + ulint index; /* index of the reserved wait cell */ +- ulint state; /* lock state acquired */ + ulint i; /* spin round count */ ++ ibool spinning = FALSE; + + ut_ad(rw_lock_validate(lock)); + +-lock_loop: +- /* Acquire the mutex protecting the rw-lock fields */ +- mutex_enter_fast(&(lock->mutex)); +- +- state = rw_lock_x_lock_low(lock, pass, file_name, line); ++ i = 0; + +- mutex_exit(&(lock->mutex)); ++lock_loop: + +- if (state == RW_LOCK_EX) { ++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) { ++ rw_x_spin_round_count += i; + + return; /* Locking succeeded */ + +- } else if (state == RW_LOCK_NOT_LOCKED) { +- +- /* Spin waiting for the writer field to become free */ +- i = 0; +- +- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED +- && i < SYNC_SPIN_ROUNDS) { +- if (srv_spin_wait_delay) { +- ut_delay(ut_rnd_interval(0, +- srv_spin_wait_delay)); +- } ++ } else { + +- i++; +- } +- if (i == SYNC_SPIN_ROUNDS) { +- os_thread_yield(); ++ if (!spinning) { ++ spinning = TRUE; ++ rw_x_spin_wait_count++; + } +- } else if (state == RW_LOCK_WAIT_EX) { + +- /* Spin waiting for the reader count field to become zero */ +- i = 0; +- +- while (rw_lock_get_reader_count(lock) != 0 +- && i < SYNC_SPIN_ROUNDS) { ++ /* Spin waiting for the lock_word to become free */ ++ while (i < SYNC_SPIN_ROUNDS ++ && lock->lock_word <= 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); +@@ -498,12 +611,13 @@ + } + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); ++ } else { ++ goto lock_loop; + } 
+- } else { +- i = 0; /* Eliminate a compiler warning */ +- ut_error; + } + ++ rw_x_spin_round_count += i; ++ + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-x-lock at %p cfile %s cline %lu rnds %lu\n", +@@ -511,39 +625,20 @@ + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + +- rw_x_spin_wait_count++; +- +- /* We try once again to obtain the lock. Acquire the mutex protecting +- the rw-lock fields */ +- +- mutex_enter(rw_lock_get_mutex(lock)); +- +- state = rw_lock_x_lock_low(lock, pass, file_name, line); +- +- if (state == RW_LOCK_EX) { +- mutex_exit(rw_lock_get_mutex(lock)); +- +- return; /* Locking succeeded */ +- } +- +- rw_x_system_call_count++; +- + sync_array_reserve_cell(sync_primary_wait_array, + lock, +-#ifdef __WIN__ +- /* On windows RW_LOCK_WAIT_EX signifies +- that this thread should wait on the +- special wait_ex_event. */ +- (state == RW_LOCK_WAIT_EX) +- ? RW_LOCK_WAIT_EX : +-#endif + RW_LOCK_EX, + file_name, line, + &index); + +- rw_lock_set_waiters(lock, 1); +- +- mutex_exit(rw_lock_get_mutex(lock)); ++ /* Waiters must be set before checking lock_word, to ensure signal ++ is sent. This could lead to a few unnecessary wake-up signals. 
*/ ++ rw_lock_set_waiter_flag(lock); ++ ++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) { ++ sync_array_free_cell(sync_primary_wait_array, index); ++ return; /* Locking succeeded */ ++ } + + if (srv_print_latch_waits) { + fprintf(stderr, +@@ -552,11 +647,13 @@ + lock->cfile_name, (ulong) lock->cline); + } + +- rw_x_system_call_count++; ++ /* these stats may not be accurate */ ++ lock->count_os_wait++; + rw_x_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + ++ i = 0; + goto lock_loop; + } + +@@ -697,7 +794,9 @@ + rw_lock_t* lock, /* in: rw-lock */ + ulint level) /* in: level */ + { ++#ifdef UNIV_SYNC_DEBUG + lock->level = level; ++#endif /* UNIV_SYNC_DEBUG */ + } + + #ifdef UNIV_SYNC_DEBUG +@@ -718,7 +817,7 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + +- mutex_enter(&(lock->mutex)); ++ rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + +@@ -728,7 +827,7 @@ + && (info->pass == 0) + && (info->lock_type == lock_type)) { + +- mutex_exit(&(lock->mutex)); ++ rw_lock_debug_mutex_exit(); + /* Found! 
*/ + + return(TRUE); +@@ -736,7 +835,7 @@ + + info = UT_LIST_GET_NEXT(list, info); + } +- mutex_exit(&(lock->mutex)); ++ rw_lock_debug_mutex_exit(); + + return(FALSE); + } +@@ -758,22 +857,18 @@ + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + +- mutex_enter(&(lock->mutex)); +- + if (lock_type == RW_LOCK_SHARED) { +- if (lock->reader_count > 0) { ++ if (rw_lock_get_reader_count(lock) > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { +- if (lock->writer == RW_LOCK_EX) { ++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + +- mutex_exit(&(lock->mutex)); +- + return(ret); + } + +@@ -801,11 +896,10 @@ + + count++; + ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_enter(&(lock->mutex)); +- +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++#endif ++ if (lock->lock_word != X_LOCK_DECR) { + + fprintf(stderr, "RW-LOCK: %p ", lock); + +@@ -821,8 +915,10 @@ + info = UT_LIST_GET_NEXT(list, info); + } + } +- ++#ifndef HAVE_ATOMIC_BUILTINS + mutex_exit(&(lock->mutex)); ++#endif ++ + lock = UT_LIST_GET_NEXT(list, lock); + } + +@@ -845,9 +941,10 @@ + "RW-LATCH INFO\n" + "RW-LATCH: %p ", lock); + +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- || (rw_lock_get_reader_count(lock) != 0) +- || (rw_lock_get_waiters(lock) != 0)) { ++#ifndef HAVE_ATOMIC_BUILTINS ++ mutex_enter(&(lock->mutex)); ++#endif ++ if (lock->lock_word != X_LOCK_DECR) { + + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", stderr); +@@ -861,6 +958,9 @@ + info = UT_LIST_GET_NEXT(list, info); + } + } ++#ifndef HAVE_ATOMIC_BUILTINS ++ mutex_exit(&(lock->mutex)); ++#endif + } + + /************************************************************************* +@@ -909,14 +1009,11 @@ + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { +- mutex_enter(rw_lock_get_mutex(lock)); + +- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) +- 
|| (rw_lock_get_reader_count(lock) != 0)) { ++ if (lock->lock_word != X_LOCK_DECR) { + count++; + } + +- mutex_exit(rw_lock_get_mutex(lock)); + lock = UT_LIST_GET_NEXT(list, lock); + } + +diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c +--- a/innobase/sync/sync0sync.c 2009-10-22 15:15:05.000000000 +0900 ++++ b/innobase/sync/sync0sync.c 2009-10-22 15:18:44.000000000 +0900 +@@ -1,8 +1,31 @@ ++/***************************************************************************** ++ ++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. ++Copyright (c) 2008, Google Inc. ++ ++Portions of this file contain modifications contributed and copyrighted by ++Google, Inc. Those modifications are gratefully acknowledged and are described ++briefly in the InnoDB documentation. The contributions by Google are ++incorporated with their permission, and subject to the conditions contained in ++the file COPYING.Google. ++ ++This program is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free Software ++Foundation; version 2 of the License. ++ ++This program is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License along with ++this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++Place, Suite 330, Boston, MA 02111-1307 USA ++ ++*****************************************************************************/ ++ + /****************************************************** + Mutex, the basic synchronization primitive + +-(c) 1995 Innobase Oy +- + Created 9/5/1995 Heikki Tuuri + *******************************************************/ + +@@ -140,17 +163,12 @@ + + ulint sync_dummy = 0; + +-/* The number of system calls made in this module. 
Intended for performance +-monitoring. */ +- +-ulint mutex_system_call_count = 0; +- + /* Number of spin waits on mutexes: for performance monitoring */ + +-ulint mutex_spin_round_count = 0; +-ulint mutex_spin_wait_count = 0; +-ulint mutex_os_wait_count = 0; +-ulint mutex_exit_count = 0; ++ib_longlong mutex_spin_round_count = 0; ++ib_longlong mutex_spin_wait_count = 0; ++ib_longlong mutex_os_wait_count = 0; ++ib_longlong mutex_exit_count = 0; + + /* The global array of wait cells for implementation of the database's own + mutexes and read-write locks */ +@@ -240,6 +258,8 @@ + { + #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + mutex_reset_lock_word(mutex); ++#elif defined(HAVE_ATOMIC_BUILTINS) ++ mutex_reset_lock_word(mutex); + #else + os_fast_mutex_init(&(mutex->os_fast_mutex)); + mutex->lock_word = 0; +@@ -325,7 +345,9 @@ + + os_event_free(mutex->event); + +-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) ++#elif defined(HAVE_ATOMIC_BUILTINS) ++#else + os_fast_mutex_free(&(mutex->os_fast_mutex)); + #endif + /* If we free the mutex protecting the mutex list (freeing is +@@ -421,6 +443,12 @@ + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + ut_ad(mutex); + ++ /* This update is not thread safe, but we don't mind if the count ++ isn't exact. Moved out of ifdef that follows because we are willing ++ to pay the cost of counting this, as the data is valuable. ++ Count the number of calls to mutex_spin_wait. 
*/ ++ mutex_spin_wait_count++; ++ + mutex_loop: + + i = 0; +@@ -433,7 +461,6 @@ + + spin_loop: + #if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP +- mutex_spin_wait_count++; + mutex->count_spin_loop++; + #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +@@ -502,8 +529,6 @@ + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); + +- mutex_system_call_count++; +- + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are +@@ -551,7 +576,6 @@ + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); + #endif + +- mutex_system_call_count++; + mutex_os_wait_count++; + + #ifndef UNIV_HOTBACKUP +@@ -1368,20 +1392,31 @@ + FILE* file) /* in: file where to print */ + { + #ifdef UNIV_SYNC_DEBUG +- fprintf(stderr, "Mutex exits %lu, rws exits %lu, rwx exits %lu\n", ++ fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n", + mutex_exit_count, rw_s_exit_count, rw_x_exit_count); + #endif + + fprintf(file, +-"Mutex spin waits %lu, rounds %lu, OS waits %lu\n" +-"RW-shared spins %lu, OS waits %lu; RW-excl spins %lu, OS waits %lu\n", +- (ulong) mutex_spin_wait_count, +- (ulong) mutex_spin_round_count, +- (ulong) mutex_os_wait_count, +- (ulong) rw_s_spin_wait_count, +- (ulong) rw_s_os_wait_count, +- (ulong) rw_x_spin_wait_count, +- (ulong) rw_x_os_wait_count); ++ "Mutex spin waits %llu, rounds %llu, OS waits %llu\n" ++ "RW-shared spins %llu, OS waits %llu;" ++ " RW-excl spins %llu, OS waits %llu\n", ++ mutex_spin_wait_count, ++ mutex_spin_round_count, ++ mutex_os_wait_count, ++ rw_s_spin_wait_count, ++ rw_s_os_wait_count, ++ rw_x_spin_wait_count, ++ rw_x_os_wait_count); ++ ++ fprintf(file, ++ "Spin rounds per wait: %.2f mutex, %.2f RW-shared, " ++ "%.2f RW-excl\n", ++ (double) mutex_spin_round_count / ++ (mutex_spin_wait_count ? 
mutex_spin_wait_count : 1), ++ (double) rw_s_spin_round_count / ++ (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1), ++ (double) rw_x_spin_round_count / ++ (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1)); + } + + /*********************************************************************** +diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info +--- /dev/null 1970-01-01 09:00:00.000000000 +0900 ++++ b/patch_info/innodb_rw_lock.info 2009-10-22 15:18:30.000000000 +0900 +@@ -0,0 +1,6 @@ ++File=innodb_rw_lock.patch ++Name=Fix of InnoDB rw_locks ported from InnoDB Plugin ++Version=1.0 ++Author=InnoBase Oy. ++License=GPL ++Comment= |