Deutsch   English   Français   Italiano  
<vdjf67$3840f$1@raubtier-asyl.eternal-september.org>

View for Bookmarking (what is this?)
Look up another Usenet article

Path: ...!eternal-september.org!feeder3.eternal-september.org!news.eternal-september.org!raubtier-asyl.eternal-september.org!.POSTED!not-for-mail
From: Bonita Montero <Bonita.Montero@gmail.com>
Newsgroups: alt.comp.lang.c,comp.lang.c
Subject: Re: A very slow program
Date: Wed, 2 Oct 2024 14:44:04 +0200
Organization: A noiseless patient Spider
Lines: 323
Message-ID: <vdjf67$3840f$1@raubtier-asyl.eternal-september.org>
References: <vc7nb0$7rl3$1@paganini.bofh.team>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 7bit
Injection-Date: Wed, 02 Oct 2024 14:43:52 +0200 (CEST)
Injection-Info: raubtier-asyl.eternal-september.org; posting-host="142f7657bb08b4e3cf99d9234e184754";
	logging-data="3411983"; mail-complaints-to="abuse@eternal-september.org";	posting-account="U2FsdGVkX191yOK4Z/tLmC6HlCGQJJ2swZS3PC8gvO8="
User-Agent: Mozilla Thunderbird
Cancel-Lock: sha1:Z03wFJvLKkK8bj0MvtqpTCogSK0=
In-Reply-To: <vc7nb0$7rl3$1@paganini.bofh.team>
Content-Language: de-DE
Bytes: 10100

Ive developed a Sieve of Erastosthenes in C++20 with an unbeaten
performance. With this code I get all primes <= 2 ^ 32 in 140ms
on my AMD 7950X 16-core Zen4-system.
It calculates first only the primes up to the square root of the
maximum. The remaining bits are partitioned according to the num-
ber of CPU-cores. Within each thread the bitmap is partitioned in
parts that fit into the L2-cache (queried via CPUID).
All operations are aligned on a cacheline-boundary to avoid false
sharing and locking of shared words.

#include <iostream>
#include <cstdlib>
#include <charconv>
#include <cstring>
#include <vector>
#include <algorithm>
#include <cmath>
#include <bit>
#include <fstream>
#include <string_view>
#include <thread>
#include <utility>
#include <new>
#include <span>
#include <array>
#include <cassert>
#include <sstream>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
	#include <intrin.h>
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || 
defined(__x86_64__))
	#include <cpuid.h>
#endif

#if defined(_MSC_VER)
	#pragma warning(disable: 26495) // uninitialized member
#endif

using namespace std;

#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif

size_t getL2Size();
bool smt();

int main( int argc, char **argv )
{
	try
	{
		if( argc < 2 )
			return EXIT_FAILURE;
		auto parse = []( char const *str, auto &value, bool bytes )
		{
			bool hex = str[0] == '0' && tolower( str[1] ) == 'x';
			str += 2 * hex;
			auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 
10 : 16 );
			if( ec != errc() )
				return false;
			if( bytes && tolower( *ptr ) == 'x' && !ptr[1] )
			{
				value *= 8;
				--value;
				++ptr;
			}
			return !*ptr;
		};
		size_t end;
		if( !parse( argv[1], end, true ) )
			return EXIT_FAILURE;
		if( end < 2 )
			return EXIT_FAILURE;
		if( (ptrdiff_t)end++ < 0 )
			throw bad_alloc();
		unsigned
			hc = jthread::hardware_concurrency(),
			nThreads;
		if( argc < 4 || !parse( argv[3], nThreads, false ) )
			nThreads = hc;
		if( !nThreads )
			return EXIT_FAILURE;
		using word_t = size_t;
		constexpr size_t
			BITS_PER_CL = CL_SIZE * 8,
			BITS = sizeof(word_t) * 8;
		size_t nBits = end / 2;
		union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / 
sizeof(word_t)]; ndi_words_cl() {} };
		vector<ndi_words_cl> sieveCachelines( (nBits + BITS_PER_CL - 1) / 
BITS_PER_CL );
		span<word_t> sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / 
BITS );
		assert(to_address( sieve.end() ) <= &to_address( sieveCachelines.end() 
)->words[0]);
		fill( sieve.begin(), sieve.end(), -1 );
		auto punch = [&]( size_t start, size_t end, size_t prime, auto )
		{
			size_t
				bit = start / 2,
				bitEnd = end / 2;
			if( bit >= bitEnd ) [[unlikely]]
				return;
			if( prime >= BITS ) [[likely]]
				do [[likely]]
					sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
				while( (bit += prime) < bitEnd );
			else
			{
				auto pSieve = sieve.begin() + bit / BITS;
				do [[likely]]
				{
					word_t
						word = *pSieve,
						mask = rotl( (word_t)-2, bit % BITS ),
						oldMask;
					do
					{
						word &= mask;
						oldMask = mask;
						mask = rotl( mask, prime % BITS );
						bit += prime;
					} while( mask < oldMask );
					*pSieve++ = word;
				} while( bit < bitEnd );
			}
		};
		size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
		[&]
		{
			for( size_t bSqrt = sqrt / 2, bit = 1; bit < bSqrt; ++bit ) [[likely]]
			{
				auto pSieve = sieve.begin () + bit / BITS;
				for( ; ; )
				{
					if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
					{
						bit += countr_zero ( w );
						break;
					}
					++pSieve;
					bit = bit + BITS & -(ptrdiff_t)BITS;
					if( bit >= bSqrt ) [[unlikely]]
						return;
				}
				size_t prime = 2 * bit + 1;
				punch( prime * prime, sqrt, prime, false_type () );
			}
		}();
		auto scan = [&]( size_t start, size_t end, auto fn )
		{
			size_t
				bit = start / 2,
				bitEnd = end / 2;
			if( bit >= bitEnd ) [[unlikely]]
				return;
			auto pSieve = sieve.begin() + bit / BITS;
			word_t word = *pSieve++ >> bit % BITS;
			for( ; ; )
			{
				size_t nextWordBit = bit + BITS & -(ptrdiff_t)BITS;
				for( unsigned gap; word; word >>= gap, word >>= 1, ++bit )
				{
					gap = countr_zero( word );
					bit += gap;
					if( bit >= bitEnd ) [[unlikely]]
						return;
					fn( bit * 2 + 1, true_type() );
				}
				bit = nextWordBit;
				if( bit >= bitEnd ) [[unlikely]]
					return;
				word = *pSieve++;
			}
		};
		vector<jthread> threads;
========== REMAINDER OF ARTICLE TRUNCATED ==========