|
View:
New views
4 Messages
—
Rating Filter:
Alert me
|
|
|
Review of the new ringbuffer Hi!
Here are a few thoughts on Tims new Birnet::Atomic::RingBuffer<T>. | namespace Atomic { | | template<typename T> | class RingBuffer { | const uint m_size; If something has a size, its useful to add an accessor method which reads the size. | T *m_buffer; Using vector<T> means doing less things yourself (and less things that can go wrong). | volatile uint m_wmark, m_rmark; You probably want to add BIRNET_PRIVATE_CLASS_COPY here, as copying (or assigning a ringbuffer (as you implemented it) will not work. | public: | explicit | RingBuffer (uint bsize) : | m_size (bsize + 1), m_wmark (0), m_rmark (bsize) | { | m_buffer = new T[m_size]; | Atomic::uint_set (&m_wmark, 0); | Atomic::uint_set (&m_rmark, 0); | } | ~RingBuffer() | { | Atomic::uint_set ((volatile uint*) &m_size, 0); | Atomic::uint_set (&m_rmark, 0); | Atomic::uint_set (&m_wmark, 0); | delete &m_buffer[m_size]; This is wrong (run it in valgrind to see that you are not deleting the right thing here). The correct code is delete[] m_buffer; | } | uint | n_writable() | { | const uint rm = Atomic::uint_get (&m_rmark); | const uint wm = Atomic::uint_get (&m_wmark); | uint space = (m_size - 1 + rm - wm) % m_size; | return space; | } | uint | write (uint length, | const T *data, | bool partial = true) | { | const uint orig_length = length; | const uint rm = Atomic::uint_get (&m_rmark); | uint wm = Atomic::uint_get (&m_wmark); | uint space = (m_size - 1 + rm - wm) % m_size; | if (!partial && length > space) | return 0; | while (length) | { | if (rm <= wm) | space = m_size - wm + (rm == 0 ? -1 : 0); | else | space = rm - wm -1; | if (!space) | break; | space = MIN (space, length); | for (int i = 0; i < space; i++) | m_buffer[wm + i] = data[i]; Its better to use std::copy, because for some data types (primitive types), then memmove will be used (template specialization). | wm = (wm + space) % m_size; | data += space; | length -= space; | } Systems with memory barriers need a write memory barrier here. In the jack driver code this looks like: // It is important that the data from the previous writes get committed // to memory *before* the index variable is updated. Otherwise, the // consumer thread could be reading invalid data, if the index variable // got written before the rest of the data (when unordered writes are // performed). write_memory_barrier(); and: void write_memory_barrier() { static volatile int dummy = 0; /* * writing this dummy integer should ensure that all prior writes * are committed to memory */ Atomic::int_set (&dummy, 0x12345678); } I am not sure whether write_memory_barrier is a sufficiently good implementation on such systems. | Atomic::uint_set (&m_wmark, wm); | return orig_length - length; | } | uint | n_readable() | { | const uint wm = Atomic::uint_get (&m_wmark); | const uint rm = Atomic::uint_get (&m_rmark); | uint space = (m_size + wm - rm) % m_size; | return space; | } | uint | read (uint length, | T *data, | bool partial = true) | { | const uint orig_length = length; | const uint wm = Atomic::uint_get (&m_wmark); | uint rm = Atomic::uint_get (&m_rmark); | uint space = (m_size + wm - rm) % m_size; | if (!partial && length > space) | return 0; | while (length) | { | if (wm < rm) | space = m_size - rm; | else | space = wm - rm; | if (!space) | break; | space = MIN (space, length); I use std::min where I can, because it doesn't have the typical macro problems (double args evaluation). | for (int i = 0; i < space; i++) | data[i] = m_buffer[rm + i]; This should also use std::copy. | rm = (rm + space) % m_size; | data += space; | length -= space; | } | Atomic::uint_set (&m_rmark, rm); | return orig_length - length; | } | }; | | } // Atomic Cu... Stefan -- Stefan Westerfeld, Hamburg/Germany, http://space.twc.de/~stefan _______________________________________________ beast mailing list beast@... http://mail.gnome.org/mailman/listinfo/beast |
|
|
Re: Review of the new ringbufferOn Thu, 25 Jan 2007, Stefan Westerfeld wrote:
> Hi! > > Here are a few thoughts on Tims new Birnet::Atomic::RingBuffer<T>. > | volatile uint m_wmark, m_rmark; > > You probably want to add BIRNET_PRIVATE_CLASS_COPY here, as copying (or > assigning a ringbuffer (as you implemented it) will not work. yes, i meant to have that here, thanks for catching. > | public: > | explicit > | RingBuffer (uint bsize) : > | m_size (bsize + 1), m_wmark (0), m_rmark (bsize) > | { > | m_buffer = new T[m_size]; > | Atomic::uint_set (&m_wmark, 0); > | Atomic::uint_set (&m_rmark, 0); > | } > | ~RingBuffer() > | { > | Atomic::uint_set ((volatile uint*) &m_size, 0); > | Atomic::uint_set (&m_rmark, 0); > | Atomic::uint_set (&m_wmark, 0); > | delete &m_buffer[m_size]; > > This is wrong (run it in valgrind to see that you are not deleting the > right thing here). The correct code is > > delete[] m_buffer; oops, right. thanks for catching ;) > | uint > | write (uint length, > | const T *data, > | bool partial = true) > | { > | const uint orig_length = length; > | const uint rm = Atomic::uint_get (&m_rmark); > | uint wm = Atomic::uint_get (&m_wmark); > | uint space = (m_size - 1 + rm - wm) % m_size; > | if (!partial && length > space) > | return 0; > | while (length) > | { > | if (rm <= wm) > | space = m_size - wm + (rm == 0 ? -1 : 0); > | else > | space = rm - wm -1; > | if (!space) > | break; > | space = MIN (space, length); > | for (int i = 0; i < space; i++) > | m_buffer[wm + i] = data[i]; > > Its better to use std::copy, because for some data types (primitive > types), then memmove will be used (template specialization). the way the code is written above, GCC can auto-vectorize it, is that true for std::copy as well? > | wm = (wm + space) % m_size; > | data += space; > | length -= space; > | } > > Systems with memory barriers need a write memory barrier here. In the > jack driver code this looks like: write/read memory barriers are implemented as part of Atomic::uint_get and Atomic::uint_set. also, volatile variables haven't been updated here, so there's no need for a barrier at this point. > | Atomic::uint_set (&m_wmark, wm); > | return orig_length - length; > | } > | uint > | read (uint length, > | T *data, > | bool partial = true) > | { > | const uint orig_length = length; > | const uint wm = Atomic::uint_get (&m_wmark); > | uint rm = Atomic::uint_get (&m_rmark); > | uint space = (m_size + wm - rm) % m_size; > | if (!partial && length > space) > | return 0; > | while (length) > | { > | if (wm < rm) > | space = m_size - rm; > | else > | space = wm - rm; > | if (!space) > | break; > | space = MIN (space, length); > I use std::min where I can, because it doesn't have the typical macro > problems (double args evaluation). double evaluation isn't a problem here (on variables) and std::min has other problems (mostly wrg typing). > | } // Atomic thanks for the input. > Cu... Stefan --- ciaoTJ _______________________________________________ beast mailing list beast@... http://mail.gnome.org/mailman/listinfo/beast |
|
|
Re: Review of the new ringbuffer Hi!
(this mail exists twice, because I forgot CCing the beast list the first fime I sent it - sorry for any inconvenience caused by this) On Thu, Jan 25, 2007 at 10:23:44AM +0100, Tim Janik wrote: > On Thu, 25 Jan 2007, Stefan Westerfeld wrote: > > Here are a few thoughts on Tims new Birnet::Atomic::RingBuffer<T>. > > > | volatile uint m_wmark, m_rmark; > > > | public: > > | uint > > | write (uint length, > > | const T *data, > > | bool partial = true) > > | { > > | const uint orig_length = length; > > | const uint rm = Atomic::uint_get (&m_rmark); > > | uint wm = Atomic::uint_get (&m_wmark); > > | uint space = (m_size - 1 + rm - wm) % m_size; > > | if (!partial && length > space) > > | return 0; > > | while (length) > > | { > > | if (rm <= wm) > > | space = m_size - wm + (rm == 0 ? -1 : 0); > > | else > > | space = rm - wm -1; > > | if (!space) > > | break; > > | space = MIN (space, length); > > | for (int i = 0; i < space; i++) > > | m_buffer[wm + i] = data[i]; > > > > Its better to use std::copy, because for some data types (primitive > > types), then memmove will be used (template specialization). > > the way the code is written above, GCC can auto-vectorize it, > is that true for std::copy as well? then, the benchmark may result in different results, depending on * the version of g++, which affects - how std::copy is implemented in the STL - how good the auto vectorizer is * whether auto vecorization can be performed where the ringbuffer was instantiated (for instance, on x86 targets, libbse does not have SSE instructions, so auto vectorization is no advantage here) * the libc version (as the version of std::copy I examined expands to memmove, so the question is really: how does memmove compare with a vectorized loop, performance wise) * whether the code uses the __restrict__ keyword - my experience is that g++ does not vectorize loops when the areas pointed to could overlap I wrote a benchmark which performs a similar loop like the one you use, and here are execution times on my Debian/AMD64: g++-4.0 --version g++-4.0 (GCC) 4.0.4 20060904 (prerelease) (Debian 4.0.3-7) Copyright (C) 2006 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ========== copy_bench ========= char (loop): 11.337754 sec char (copy): 0.573089 sec int (loop): 2.841360 sec int (copy): 0.575760 sec float (loop): 2.841312 sec float (copy): 0.572800 sec ========== copy_bench_tv ========= char (loop): 11.357746 sec char (copy): 0.567677 sec int (loop): 1.207976 sec int (copy): 0.572344 sec float (loop): 0.969715 sec float (copy): 0.568255 sec ========== copy_bench_res ========= char (loop): 11.337712 sec char (copy): 0.810970 sec int (loop): 2.844015 sec int (copy): 0.802650 sec float (loop): 2.840387 sec float (copy): 0.806874 sec ========== copy_bench_res_tv ========= char (loop): 1.202510 sec char (copy): 0.575979 sec int (loop): 1.203514 sec int (copy): 0.575684 sec float (loop): 0.966977 sec float (copy): 0.571858 sec g++-4.1 --version g++-4.1 (GCC) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21) Copyright (C) 2006 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ========== copy_bench ========= char (loop): 11.336817 sec char (copy): 0.567631 sec int (loop): 3.787092 sec int (copy): 0.569993 sec float (loop): 3.786067 sec float (copy): 0.568548 sec ========== copy_bench_tv ========= char (loop): 11.334109 sec char (copy): 0.971198 sec int (loop): 3.785412 sec int (copy): 0.967551 sec float (loop): 3.784448 sec float (copy): 0.974924 sec ========== copy_bench_res ========= char (loop): 11.335676 sec char (copy): 0.572069 sec int (loop): 3.785080 sec int (copy): 0.569166 sec float (loop): 3.785338 sec float (copy): 0.569591 sec ========== copy_bench_res_tv ========= char (loop): 1.206806 sec char (copy): 0.573147 sec int (loop): 3.440046 sec int (copy): 0.574105 sec float (loop): 2.845281 sec float (copy): 0.575720 sec /usr/lib/gcc-snapshot/bin/g++ --version g++ (GCC) 4.3.0 20061022 (experimental) Copyright (C) 2006 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ========== copy_bench ========= char (loop): 11.333177 sec char (copy): 0.569332 sec int (loop): 3.784759 sec int (copy): 0.571180 sec float (loop): 3.784947 sec float (copy): 0.572712 sec ========== copy_bench_tv ========= char (loop): 11.336388 sec char (copy): 0.979739 sec int (loop): 3.784161 sec int (copy): 0.985023 sec float (loop): 3.784255 sec float (copy): 0.984372 sec ========== copy_bench_res ========= char (loop): 11.336951 sec char (copy): 0.968223 sec int (loop): 3.784592 sec int (copy): 0.975108 sec float (loop): 3.785653 sec float (copy): 0.967514 sec ========== copy_bench_res_tv ========= char (loop): 1.320710 sec char (copy): 0.578194 sec int (loop): 2.967469 sec int (copy): 0.575978 sec float (loop): 3.318709 sec float (copy): 0.570503 sec A _tv suffix indicates that the tree vectorizer is enabled, a _res suffix indicates that the __restrict__ keyword was used to mark the blocks in the assignment loop as non overlapping (otherwise the vectorizer can refuse the loop). What you see is that on my system, in every case std::copy was faster than a hand written loop. Vectorization does make a difference, but still the vectorized loop is slower than std::copy in every case. I'll attach the benchmark and a Makefile below. Your machine(s) may behave quite a bit different from mine. Run something like: for i in g++-4.0 g++-4.1 /usr/lib/gcc-snapshot/bin/g++; do make clean all CXX=$i >/dev/null; make bench CXX=$i; done > copy_bench_results.txt To test copy bench with a few compilers. And no, I can not explain why sometimes std::copy only takes 0.57 seconds or so, whereas sometimes it takes 0.98 seconds. I have no idea. > > | wm = (wm + space) % m_size; > > | data += space; > > | length -= space; > > | } > > > > Systems with memory barriers need a write memory barrier here. In the > > jack driver code this looks like: > > write/read memory barriers are implemented as part of Atomic::uint_get > and Atomic::uint_set. also, volatile variables haven't been updated > here, so there's no need for a barrier at this point. void Atomic::uint_set (volatile int *aint, int arg) { *aint = arg; write_memory_barrier(); } then setting m_wmark would set the index to the new value in the write cache of the processor (core) executing the thread. Now if the memory barrier is encountered, the write cache is written to main memory in some order. But "in some order" could mean that the new value of w_mark is written to main memory before the new value of the writes before arrive in main memory. http://trac.zeitherrschaft.org/zzub/browser/trunk/src/portaudio/src/hostapi/coreaudio/ringbuffer.c?rev=663 for instance has these barriers (before updating the index). Here is a mail which briefly justifies their existence: http://techweb.rfa.org/pipermail/portaudio/2006-May/005651.html > > | space = MIN (space, length); > > I use std::min where I can, because it doesn't have the typical macro > > problems (double args evaluation). > > double evaluation isn't a problem here (on variables) and > std::min has other problems (mostly wrg typing). Yes, I know, so its a tradeoff between two solutions which don't really exactly do what I think they should do... Cu... Stefan -- Stefan Westerfeld, Hamburg/Germany, http://space.twc.de/~stefan #include <algorithm> #include <sys/time.h> #include <time.h> #ifdef USE_RESTRICT #define RESTRICT __restrict__ #else #define RESTRICT #warning "no restrict used, so probably the compiler will not vectorize the code" #endif template<typename T, bool USE_COPY> class Buffer { const int m_size; T* RESTRICT m_data; public: Buffer (int size) : m_size (size), m_data (new T[size]) { } ~Buffer() { delete[] m_data; } void xcopy (const T* RESTRICT src, T* RESTRICT dest) { for (int i = 0; i < m_size; i++) dest[i] = src[i]; } void set_data (const T* new_data) { if (USE_COPY) { std::copy (new_data, new_data + m_size, m_data); } else { xcopy (new_data, m_data); } } }; double gettime () { timeval tv; gettimeofday (&tv, 0); return double(tv.tv_sec) + double(tv.tv_usec) * (1.0 / 1000000.0); } template<typename T, bool USE_COPY> void bench (const char *type) { const int size = 4096 / sizeof (T); T src1[size]; T src2[size]; double start = gettime(); Buffer<T, USE_COPY> buffer (size); for (int i = 0; i < 1000000; i++) { buffer.set_data (src1); buffer.set_data (src2); } double end = gettime(); printf ("%s: %f sec\n", type, end - start); } int main() { bench<char, false> ("char (loop)"); bench<char, true> ("char (copy)"); bench<int, false> ("int (loop)"); bench<int, true> ("int (copy)"); bench<float, false> ("float (loop)"); bench<float, true> ("float (copy)"); return 0; } CXXFLAGS = -O3 TARGETS = copy_bench copy_bench_tv copy_bench_res copy_bench_res_tv all: $(TARGETS) bench: $(CXX) --version @for i in $(TARGETS); do echo "========== $$i ========="; $$i; done clean: rm -f $(TARGETS) copy_bench: copy_bench.cc $(CXX) $(CXXFLAGS) -o $@ $< copy_bench_tv: copy_bench.cc $(CXX) $(CXXFLAGS) -ftree-vectorize -ftree-vectorizer-verbose=1 -o $@ $< copy_bench_res: copy_bench.cc $(CXX) $(CXXFLAGS) -DUSE_RESTRICT -o $@ $< copy_bench_res_tv: copy_bench.cc $(CXX) $(CXXFLAGS) -ftree-vectorize -ftree-vectorizer-verbose=1 -DUSE_RESTRICT -o $@ $< _______________________________________________ beast mailing list beast@... http://mail.gnome.org/mailman/listinfo/beast |
|
|
Re: Review of the new ringbufferOn Fri, 26 Jan 2007, Stefan Westerfeld wrote:
> On Thu, Jan 25, 2007 at 10:23:44AM +0100, Tim Janik wrote: >> On Thu, 25 Jan 2007, Stefan Westerfeld wrote: >>> | for (int i = 0; i < space; i++) >>> | m_buffer[wm + i] = data[i]; >>> >>> Its better to use std::copy, because for some data types (primitive >>> types), then memmove will be used (template specialization). >> >> the way the code is written above, GCC can auto-vectorize it, >> is that true for std::copy as well? > > I think the only way to find out what is faster is a benchmark, and even > then, the benchmark may result in different results, depending on > What you see is that on my system, in every case std::copy was faster > than a hand written loop. Vectorization does make a difference, but > still the vectorized loop is slower than std::copy in every case. interesting, the other question that came to my mind about std::copy is how it correctly handles memcpy vs. copy constructors. stl_algobase.h tells us that memmove is used if the copy argument types are scalars and equal. cpp_type_traits.h implements __is_scalar by matching bool, char, wchar_t, short, int, long, long long, float, double, pointer. otherwise, an assignment loop similar to the one we already have is used. that should eliminate the need for vectorization in most cases. >>> | wm = (wm + space) % m_size; >>> | data += space; >>> | length -= space; >>> | } >>> >>> Systems with memory barriers need a write memory barrier here. In the >>> jack driver code this looks like: >> >> write/read memory barriers are implemented as part of Atomic::uint_get >> and Atomic::uint_set. also, volatile variables haven't been updated >> here, so there's no need for a barrier at this point. > > Are you sure? Suppose Atomic::uint_set looked like this: i thought you meant the updating of wm here which is a local variable and didn't quite make sense to me. reading up in: > http://techweb.rfa.org/pipermail/portaudio/2006-May/005651.html i get your point though. i've added descriptive functions to birnet for the barriers now, using them once it's pushed to birnet.git and merged into beast should make the jack driver more readable as well: diff --git a/birnet/birnetthread.hh b/birnet/birnetthread.hh index 48b9cee..061cefd 100644 --- a/birnet/birnetthread.hh +++ b/birnet/birnetthread.hh @@ -62,6 +62,8 @@ public: }; namespace Atomic { +inline void read_barrier (void) { int dummy; ThreadTable.atomic_int_get (&dummy); } +inline void write_barrier (void) { int dummy; ThreadTable.atomic_int_set (&dummy, 0); } /* atomic integers */ inline void int_set (volatile int *iptr, int value) { ThreadTable.atomic_int_set (iptr, value); } inline int int_get (volatile int *iptr) { return ThreadTable.atomic_int_get (iptr); } and yes, this is a sufficient barrier implementation, the functions boil down to g_atomic_int_get/g_atomic_int_set which are special cased on systems that need barriers to implement a memory barrier. glib doesn't distinguish between read/write barriers, but making that distinction in the API should help readability in the actual code. thanks again, i'll update the code accordingly. > Cu... Stefan --- ciaoTJ _______________________________________________ beast mailing list beast@... http://mail.gnome.org/mailman/listinfo/beast |
| Free embeddable forum powered by Nabble | Forum Help |