#include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; mpz_class load_mpz(const char *fname) { FILE *f = fopen(fname,"r"); if (!f) { perror(fname); exit(111); } mpz_class result; mpz_inp_raw(result.get_mpz_t(),f); fclose(f); return result; } void load_mpzarray(const char *fname, mpz_class *m, const int len) { FILE *f = fopen(fname,"r"); if (!f) { perror(fname); exit(111); } for (int i = 0;i < len;++i) mpz_inp_raw(m[i].get_mpz_t(),f); fclose(f); } double current_time(void) { struct timeval t; gettimeofday(&t, NULL); return (double) (t.tv_sec + (double) (t.tv_usec / 1000000.0)); } int mpi_rank; int mpi_size; double start_time; #define coutranktime cout << mpi_rank << " time " << current_time() - start_time // for left l-bit search: // generate prefixes between mystartleft[b] and myendleft[b] inclusive vector mystartleft; vector myendleft; // for right (n-l)-bit search: // generate reversals of prefixes between mystartleft[b] and myendleft[b] inclusive vector mystartright; vector myendright; int w; int l; int n; int parallelizeww = 1; // 0: w-way parallelization of matrix-vector product // 1: (w*w)-way parallelization of matrix-vector product // both ways work; choice is for speed // 1 has slightly higher synchronization cost // 1 is clearly better balanced when omp_get_max_threads() does not divide w // 1 is also more robust against cpu being busy mpz_class *tptr; mpz_class pzt; mpz_class q; long nu; mpz_class *Bptr; vector > Ltable; vector Lx; // fill Ltable with left products indexed by l-bit strings in lex order // but limit to strings between mystartleft and myendleft // and limit to strings starting with b-bit string xleft (big-endian) // using left product L corresponding to xleft void precompute(int b,int xleft,mpz_class *L) { if (xleft < mystartleft[b]) return; if (xleft > myendleft[b]) return; if (b == l) { coutranktime << " L " << xleft << "\n" << flush; Ltable.push_back(vector(L,L+w)); Lx.push_back(xleft); return; } mpz_class newL[w]; for (int xb = 0;xb < 2;++xb) { if (xleft * 2 + xb < mystartleft[b + 1]) continue; if (xleft * 2 + xb > myendleft[b + 1]) continue; double startvmtime = current_time(); if (parallelizeww) { mpz_class *product = new mpz_class[w * w]; #pragma omp parallel { #pragma omp for for (int ij = 0;ij < w * w;++ij) { int i = ij / w; int j = ij % w; product[ij] = L[j] * Bptr[((b * 2 + xb) * w + i) * w + j]; } #pragma omp for for (int i = 0;i < w;++i) { newL[i] = product[i * w]; for (int j = 1;j < w;++j) newL[i] += product[i * w + j]; newL[i] %= q; } } delete[] product; } else { #pragma omp parallel { #pragma omp for for (int i = 0;i < w;++i) { newL[i] = L[0] * Bptr[((b * 2 + xb) * w + i) * w + 0]; for (int j = 1;j < w;++j) newL[i] += L[j] * Bptr[((b * 2 + xb) * w + i) * w + j]; newL[i] %= q; } } } coutranktime << " vm " << current_time() - startvmtime << "\n" << flush; precompute(b + 1,xleft * 2 + xb,newL); } } // scan right products indexed by (n-l)-bit strings in revlex order // but limit to strings between mystartright and myendright // and limit to reversals of strings starting with b-bit string xright (big-endian) void mainloop(int b,int xright,mpz_class *R) { if (xright < mystartright[b]) return; if (xright > myendright[b]) return; if (b == n - l) { coutranktime << " R " << xright << "\n" << flush; #pragma omp parallel { #pragma omp for for (unsigned long long Lpos = 0;Lpos < Ltable.size();++Lpos) { double startvvtime = current_time(); mpz_class y; for (int i = 0;i < w;++i) y += R[i] * Ltable[Lpos][i]; y *= pzt; y %= q; if (y > q - y) y -= q; double now = current_time(); #pragma omp critical { int solution = (mpz_sizeinbase(y.get_mpz_t(),2) > mpz_sizeinbase(q.get_mpz_t(),2) - nu); cout << mpi_rank << " time " << now - start_time; cout << " vv " << now - startvvtime; cout << " checksum "; for (int i = 0;i < l;++i) cout << "01"[1 & (Lx[Lpos] >> (l - 1 - i))]; for (int i = l;i < n;++i) cout << "01"[1 & (xright >> (i - l))]; cout << " " << mpz_fdiv_ui(y.get_mpz_t(),7121606815140220673); if (solution) cout << " solution"; cout << "\n" << flush; // if (solution) exit(0); } } } return; } mpz_class newR[w]; for (int xb = 0;xb < 2;++xb) { if (xright * 2 + xb < mystartright[b + 1]) continue; if (xright * 2 + xb > myendright[b + 1]) continue; double startmvtime = current_time(); if (parallelizeww) { mpz_class *product = new mpz_class[w * w]; #pragma omp parallel { #pragma omp for for (int ij = 0;ij < w * w;++ij) { int i = ij / w; int j = ij % w; product[ij] = R[j] * Bptr[(((n - 1 - b - l) * 2 + xb) * w + j) * w + i]; } #pragma omp for for (int i = 0;i < w;++i) { newR[i] = product[i * w]; for (int j = 1;j < w;++j) newR[i] += product[i * w + j]; newR[i] %= q; } } delete[] product; } else { #pragma omp parallel { #pragma omp for for (int i = 0;i < w;++i) { newR[i] = R[0] * Bptr[(((n - 1 - b - l) * 2 + xb) * w + 0) * w + i]; for (int j = 1;j < w;++j) newR[i] += R[j] * Bptr[(((n - 1 - b - l) * 2 + xb) * w + j) * w + i]; newR[i] %= q; } } } coutranktime << " mv " << current_time() - startmvtime << "\n" << flush; mainloop(b + 1,xright * 2 + xb,newR); } } int main(int argc,char **argv) { start_time = current_time(); MPI::Init(); mpi_rank = MPI::COMM_WORLD.Get_rank(); mpi_size = MPI::COMM_WORLD.Get_size(); cout << fixed << setprecision(3) << showpoint; cout << mpi_rank << " outof " << mpi_size << "\n" << flush; w = load_mpz("size").get_si(); pzt = load_mpz("pzt"); q = load_mpz("q"); nu = load_mpz("nu").get_si(); mpz_class s[w]; mpz_class t[w]; load_mpzarray("s_enc", s, w); load_mpzarray("t_enc", t, w); tptr = t; n = w - 2; l = n / 2; if (!(w % omp_get_max_threads())) parallelizeww = 0; uint64_t startleft[mpi_size]; uint64_t pastendleft[mpi_size]; for (int node = 0;node < mpi_size;++node) { startleft[node] = (float)(1UL << l) / (float)mpi_size * (float)node; pastendleft[node] = (float)(1UL << l) / (float)mpi_size * ((float)node + 1.0); } for (int b = 0;b <= l;++b) { mystartleft.push_back(startleft[mpi_rank] >> (l - b)); myendleft.push_back((pastendleft[mpi_rank] - 1) >> (l - b)); } cout << mpi_rank << " precomprange " << mystartleft[l] << " " << myendleft[l] << "\n" << flush; uint64_t startright[mpi_size]; uint64_t pastendright[mpi_size]; for (int node = 0;node < mpi_size;++node) { startright[node] = (float)(1UL << (n - l)) / (float)mpi_size * (float)node; pastendright[node] = (float)(1UL << (n - l)) / (float)mpi_size * ((float)node + 1.0); } for (int b = 0;b <= n - l;++b) { mystartright.push_back(startright[mpi_rank] >> (n - l - b)); myendright.push_back((pastendright[mpi_rank] - 1) >> (n - l - b)); } cout << mpi_rank << " mainlooprange " << mystartright[n - l] << " " << myendright[n - l] << "\n" << flush; mpz_class B[n][2][w * w]; Bptr = &B[0][0][0]; for (int b = 0;b < l;++b) { std::stringstream ss; ss << b; string fname = ss.str() + ".zero"; load_mpzarray(fname.c_str(),B[b][0],w * w); fname = ss.str() + ".one"; load_mpzarray(fname.c_str(),B[b][1],w * w); } coutranktime << " firstloaddone\n" << flush; if (startleft[mpi_rank] < pastendleft[mpi_rank]) precompute(0,0,s); coutranktime << " precompdone\n" << flush; if (mpi_size > 1) { mpz_class receive[w]; for (int node = 0;node < mpi_size;++node) { for (uint64_t tablepos = startleft[node];tablepos < pastendleft[node];++tablepos) { for (int j = 0;j < w;++j) { void *raw = NULL; size_t rawbytes; if (node == mpi_rank) raw = mpz_export(0,&rawbytes,1,1,0,0,Ltable[tablepos - startleft[node]][j].get_mpz_t()); MPI::COMM_WORLD.Bcast(&rawbytes,sizeof rawbytes,MPI::BYTE,node); if (node != mpi_rank) raw = malloc(rawbytes); MPI::COMM_WORLD.Bcast(raw,rawbytes,MPI::BYTE,node); if (node != mpi_rank) mpz_import(receive[j].get_mpz_t(),rawbytes,1,1,0,0,raw); free(raw); } if (node != mpi_rank) { Ltable.push_back(vector(receive,receive + w)); Lx.push_back(tablepos); } } } } coutranktime << " broadcastdone\n" << flush; for (int b = l;b < n;++b) { std::stringstream ss; ss << b; string fname = ss.str() + ".zero"; load_mpzarray(fname.c_str(),B[b - l][0],w * w); fname = ss.str() + ".one"; load_mpzarray(fname.c_str(),B[b - l][1],w * w); } coutranktime << " secondloaddone\n" << flush; if (startright[mpi_rank] < pastendright[mpi_rank]) mainloop(0,0,t); coutranktime << " mainloopdone\n" << flush; MPI::Finalize(); coutranktime << " finalizedone\n" << flush; return 0; }