// Note: the size must be power of 2.
#define SEGMENT_SIZE 0x200000
// 32 KiB L1 data cache = 2^15*2^3 = 2^18 bits, identifying 2^19 numbers.
// #define SEGMENT_SIZE 0x80000
// 256 KiB L2 cache
// #define SEGMENT_SIZE 0x400000
// 6 MiB L3 cache, use 2 MiB of it
// #define SEGMENT_SIZE 0x2000000
boolprime(char*data,uint32_tn){if(n==2)returntrue;if(!(n&1))returnfalse;return!(data[n>>4]&(1<<(n>>1&7)));}/*
* Run sieve of Eratosthenes in [0, max(SEGMENT_SIZE, sqrt(UINT32_MAX))),
* and store prime numbers in [3, sqrt(UINT32_MAX)].
*/uint32_tinitial_sieve(char*data,uint32_t*primes){constuint32_tSQ=(uint32_t)sqrt(UINT32_MAX),LIMIT=SEGMENT_SIZE>SQ?SEGMENT_SIZE:SQ;data[0]=1;uint32_tn,sq,c,step,count=0;for(n=3,sq=(uint32_t)sqrt(LIMIT);n<=sq;n+=2){if(data[n>>4]&(1<<(n>>1&7)))continue;for(c=n*n,step=n<<1;c<=LIMIT;c+=step)data[c>>4]|=1<<(c>>1&7);}for(n=3;n<=SQ;n+=2){if(!(data[n>>4]&(1<<(n>>1&7))))primes[count++]=n;}returncount;}voidsegmented_sieve_of_eratosthenes(char*data){// store prime numbers in [3, sqrt(UINT32_MAX)]
uint32_t*primes=(uint32_t*)malloc(8192*sizeof(uint32_t));constuint32_tcount=initial_sieve(data,primes);uint32_tsq,i,p,c,step;constuint32_tSEGMENTS=(uint32_t)(1<<31)/(SEGMENT_SIZE>>1);// first segment has been sieved
for(uint32_ts=1,low=SEGMENT_SIZE,high=(SEGMENT_SIZE<<1)-1;s<SEGMENTS;++s,low+=SEGMENT_SIZE,high+=SEGMENT_SIZE){sq=(uint32_t)sqrt(high);for(i=0;i<count&&(p=primes[i])<=sq;++i){// c = p^2 + 2k * p
c=low/p*p;if(c<low)c+=p;if(!(c%2))c+=p;// uint32t overflow in last segment
for(step=p<<1;low<=c&&c<=high;c+=step)data[c>>4]|=1<<(c>>1&7);}}free(primes);}
速度有了明显的提升:
1
2
3
Segmented sieve of Eratosthenes cost 5.576287 seconds.
Same story.
Verification cost 63.089097 seconds.