给40亿个不重复的unsigned int的整数,没排过序的,然后再给一个数,bitmap解法

    技术2022-07-20  72

    总共有4*10^9个数字,如果直接放在内存中,需要的内存量是 4x109x4bytes(一个unsigned int为4bytes) ~= 16GBytes内存 ,用bitmap表示就是4x109bits~=512MBytes=232 = 4 294 967 296 ,所以索性就分配512M内存,然后依次将这个40亿不重复的数字添加到bitmap中,默认为0,添加后为1。最后直接判断要查找的数字的bitmap值,如果为0,则表示没有,如果为1,则表示有。 C++代码,用char型(每个char占用1字节)来存储数据,一个char占8bit,因此需要的总空间为maxsize//8,注意整除有可能比所需空间少,因此在最后在加1,总共为((TOTAL_DATA_SIZE+BITMAP_BITS-1)/BITMAP_BITS)的空间。 对每一个整数,将其整除8((v)/BITMAP_BITS)得到所在的位置,在对8求余得到具体位数(0-7位,用将1左移0-7位的方法实现)((b[(v)/BITMAP_BITS] |= (1 << ((v)%BITMAP_BITS)))将数字对应bit位置为1)。然后在查询时只要查询数字对应的位数是否为1,为1则出现过,为0则未出现。(b[(v)/BITMAP_BITS] & (1 << ((v)%BITMAP_BITS)))

    #include <stdio.h> #include <stdlib.h> #include <string.h> #define TOTAL_DATA_SIZE 300 //4000000000 /*40亿个不重复的unsigned int整数大数据*/ //#define MAX_BUFFER_SIZE 536870912 /*512MB*/ typedef char bitmap_type; #define BITMAP_BITS (sizeof(bitmap_type)*8) #define MAX_BUFFER_SIZE ((TOTAL_DATA_SIZE+BITMAP_BITS-1)/BITMAP_BITS) #之所以要BITMAP_BITS-1是为了防止出现只用TOTAL_DATA_SIZE/BITMAP_BITS后的值比所需空间少的情况,BITMAP_BITS-1=7,TOTAL_DATA_SIZE+7要么大于所需空间,要么正好等于(+7消除了余数) #define bitmap_set(b,v) (b[(v)/BITMAP_BITS] |= (1 << ((v)%BITMAP_BITS))) #define bitmap_clear(b,v) (b[(v)/BITMAP_BITS] &= ~(1 << ((v)%BITMAP_BITS))) #define bitmap_isset(b,v) (b[(v)/BITMAP_BITS] & (1 << ((v)%BITMAP_BITS))) #define bitmap_zero(b) memset(b,0,sizeof(bitmap_type)*MAX_BUFFER_SIZE) #define DATA_FILE_NAME "data.txt" int main(int argc, char*argv[]) { bitmap_type *bitmap = NULL; FILE *fp; unsigned int num; bitmap = (bitmap_type*)malloc(sizeof(bitmap_type)*MAX_BUFFER_SIZE); if(bitmap == NULL) { printf("bitmap malloc error\n"); return -1; } bitmap_zero(bitmap); //memset(bitmap,0,sizeof(bitmap_type)*MAX_BUFFER_SIZE); fp = fopen(DATA_FILE_NAME,"rb"); if(fp == NULL) { printf("fopen file %s error\n",DATA_FILE_NAME); return -1; } while(fscanf(fp,"%d",&num) != EOF) { bitmap_set(bitmap,num); } if(bitmap_isset(bitmap,2)) { printf("2 is set in the bitmap\n"); } if(bitmap_isset(bitmap,222)) { printf("222 is set in the bitmap\n"); }else { printf("222 is not in the bitmap\n"); } if(bitmap_isset(bitmap,333)) { printf("333 is set in the bitmap\n"); }else { printf("333 is not in the bitmap\n"); } return 0; } ————————————————

    python实现

    #给40亿个不重复的unsigned int的整数,没排过序的,然后再给一个数,bitmap解法 ############ list1 = [1,2,3,4,5,6,7,8,0] def setchar(mark,move): lista[mark]|=1<<move def findchar(mark,move): if lista[mark] & 1<<move: return True else:return False lenth = len(list1)//8+1 lista = [0 for _ in range(lenth)] for i in list1: key = i//8 move = i%8 setchar(key,move) print(lista) list2 = [1,3,7,9] for i in list2: key = i//8 move = i%8 print(i,findchar(key,move)) ################
    Processed: 0.008, SQL: 9