Deutsch English Français Italiano |
<vdjf67$3840f$1@raubtier-asyl.eternal-september.org> View for Bookmarking (what is this?) Look up another Usenet article |
Path: ...!eternal-september.org!feeder3.eternal-september.org!news.eternal-september.org!raubtier-asyl.eternal-september.org!.POSTED!not-for-mail From: Bonita Montero <Bonita.Montero@gmail.com> Newsgroups: alt.comp.lang.c,comp.lang.c Subject: Re: A very slow program Date: Wed, 2 Oct 2024 14:44:04 +0200 Organization: A noiseless patient Spider Lines: 323 Message-ID: <vdjf67$3840f$1@raubtier-asyl.eternal-september.org> References: <vc7nb0$7rl3$1@paganini.bofh.team> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 7bit Injection-Date: Wed, 02 Oct 2024 14:43:52 +0200 (CEST) Injection-Info: raubtier-asyl.eternal-september.org; posting-host="142f7657bb08b4e3cf99d9234e184754"; logging-data="3411983"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX191yOK4Z/tLmC6HlCGQJJ2swZS3PC8gvO8=" User-Agent: Mozilla Thunderbird Cancel-Lock: sha1:Z03wFJvLKkK8bj0MvtqpTCogSK0= In-Reply-To: <vc7nb0$7rl3$1@paganini.bofh.team> Content-Language: de-DE Bytes: 10100 Ive developed a Sieve of Erastosthenes in C++20 with an unbeaten performance. With this code I get all primes <= 2 ^ 32 in 140ms on my AMD 7950X 16-core Zen4-system. It calculates first only the primes up to the square root of the maximum. The remaining bits are partitioned according to the num- ber of CPU-cores. Within each thread the bitmap is partitioned in parts that fit into the L2-cache (queried via CPUID). All operations are aligned on a cacheline-boundary to avoid false sharing and locking of shared words. #include <iostream> #include <cstdlib> #include <charconv> #include <cstring> #include <vector> #include <algorithm> #include <cmath> #include <bit> #include <fstream> #include <string_view> #include <thread> #include <utility> #include <new> #include <span> #include <array> #include <cassert> #include <sstream> #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) #include <intrin.h> #elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) #include <cpuid.h> #endif #if defined(_MSC_VER) #pragma warning(disable: 26495) // uninitialized member #endif using namespace std; #if defined(__cpp_lib_hardware_interference_size) constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size; #else constexpr ptrdiff_t CL_SIZE = 64; #endif size_t getL2Size(); bool smt(); int main( int argc, char **argv ) { try { if( argc < 2 ) return EXIT_FAILURE; auto parse = []( char const *str, auto &value, bool bytes ) { bool hex = str[0] == '0' && tolower( str[1] ) == 'x'; str += 2 * hex; auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 10 : 16 ); if( ec != errc() ) return false; if( bytes && tolower( *ptr ) == 'x' && !ptr[1] ) { value *= 8; --value; ++ptr; } return !*ptr; }; size_t end; if( !parse( argv[1], end, true ) ) return EXIT_FAILURE; if( end < 2 ) return EXIT_FAILURE; if( (ptrdiff_t)end++ < 0 ) throw bad_alloc(); unsigned hc = jthread::hardware_concurrency(), nThreads; if( argc < 4 || !parse( argv[3], nThreads, false ) ) nThreads = hc; if( !nThreads ) return EXIT_FAILURE; using word_t = size_t; constexpr size_t BITS_PER_CL = CL_SIZE * 8, BITS = sizeof(word_t) * 8; size_t nBits = end / 2; union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / sizeof(word_t)]; ndi_words_cl() {} }; vector<ndi_words_cl> sieveCachelines( (nBits + BITS_PER_CL - 1) / BITS_PER_CL ); span<word_t> sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / BITS ); assert(to_address( sieve.end() ) <= &to_address( sieveCachelines.end() )->words[0]); fill( sieve.begin(), sieve.end(), -1 ); auto punch = [&]( size_t start, size_t end, size_t prime, auto ) { size_t bit = start / 2, bitEnd = end / 2; if( bit >= bitEnd ) [[unlikely]] return; if( prime >= BITS ) [[likely]] do [[likely]] sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS ); while( (bit += prime) < bitEnd ); else { auto pSieve = sieve.begin() + bit / BITS; do [[likely]] { word_t word = *pSieve, mask = rotl( (word_t)-2, bit % BITS ), oldMask; do { word &= mask; oldMask = mask; mask = rotl( mask, prime % BITS ); bit += prime; } while( mask < oldMask ); *pSieve++ = word; } while( bit < bitEnd ); } }; size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) ); [&] { for( size_t bSqrt = sqrt / 2, bit = 1; bit < bSqrt; ++bit ) [[likely]] { auto pSieve = sieve.begin () + bit / BITS; for( ; ; ) { if( word_t w = *pSieve >> bit % BITS; w ) [[likely]] { bit += countr_zero ( w ); break; } ++pSieve; bit = bit + BITS & -(ptrdiff_t)BITS; if( bit >= bSqrt ) [[unlikely]] return; } size_t prime = 2 * bit + 1; punch( prime * prime, sqrt, prime, false_type () ); } }(); auto scan = [&]( size_t start, size_t end, auto fn ) { size_t bit = start / 2, bitEnd = end / 2; if( bit >= bitEnd ) [[unlikely]] return; auto pSieve = sieve.begin() + bit / BITS; word_t word = *pSieve++ >> bit % BITS; for( ; ; ) { size_t nextWordBit = bit + BITS & -(ptrdiff_t)BITS; for( unsigned gap; word; word >>= gap, word >>= 1, ++bit ) { gap = countr_zero( word ); bit += gap; if( bit >= bitEnd ) [[unlikely]] return; fn( bit * 2 + 1, true_type() ); } bit = nextWordBit; if( bit >= bitEnd ) [[unlikely]] return; word = *pSieve++; } }; vector<jthread> threads; ========== REMAINDER OF ARTICLE TRUNCATED ==========