///////////////////////////////////////////////////////////////////////////////// // The Toolkit for Advanced Discriminative Modeling // Copyright (C) 2001-2005 Robert Malouf // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ///////////////////////////////////////////////////////////////////////////////// // *** // *** Dataset // *** // $Id: dataset.cc,v 1.3 2006/03/17 01:36:31 malouf Exp $ // Copyright (c) 2001-2002 Robert Malouf #include "tadm.h" #include "fileio.h" #include #include #include #include // constructor Dataset::Dataset() { PetscTruth valid; // extract options PetscOptionsHasName(PETSC_NULL,"-correction",&valid); correct = valid; PetscOptionsGetInt(PETSC_NULL,"-bootstrap",&bootstrap,&valid); if (!valid) bootstrap = 0; initialized = false; } // decompose 1D problem (stolen from MPE) #undef __FUNCT__ #define __FUNCT__ "Decomp1d" int Decomp1d(int n, int size, int rank, int *s, int *e) { int nlocal, deficit; nlocal = n / size; *s = rank * nlocal + 1; deficit = n % size; *s = *s + ((rank < deficit) ? rank : deficit); if (rank < deficit) nlocal++; *e = *s + nlocal - 1; if (*e > n || rank == size-1) *e = n; return MPI_SUCCESS; } // scan file to measure its size #undef __FUNCT__ #define __FUNCT__ "measureFile" int measureFile(Datafile &f, int *nClasses, int *nContexts, int *nFeats, int *nNZeros, double *c) { double v, vv; int ii,jj,k,id,ierr; ierr = PetscLogEventBegin(MEASURE_EVENT,0,0,0,0);CHKERRQ(ierr); MPI_Comm_rank(PETSC_COMM_WORLD,&id); if (id == 0) { *nClasses = *nContexts = *nFeats = *nNZeros = 0; *c = 0.0; f.firstContext(); PetscTruth memd; PetscOptionsName("-memd","MEMD estimation","Estimate",&memd); // read file while (f.getCount(&ii) != EOF) { if (ii > 0) { (*nContexts)++; *nClasses += ii;; for (int i=0;i 1) VecLoad(v,VECMPI,&p_ref); else VecLoad(v,VECSEQ,&p_ref); CHKERRQ(ierr); ierr = PetscViewerDestroy(v);CHKERRQ(ierr); // get feature matrix sprintf(tmp,"%s.data.gz",filename); ierr = PetscViewerBinaryOpen(PETSC_COMM_WORLD,tmp,PETSC_BINARY_RDONLY,&v); CHKERRQ(ierr); if (nProcs > 1) MatLoad(v,MATMPIAIJ,&data); else MatLoad(v,MATSEQAIJ,&data); CHKERRQ(ierr); ierr = PetscViewerDestroy(v);CHKERRQ(ierr); // get dataset size MatGetSize(data,nClasses,nFeats); MatGetInfo(data,MAT_GLOBAL_SUM,&info); nClasses = int(info.rows_global); nFeats = int(info.columns_global); nNZeros = int(info.nz_used); // set up e_ref if (nProcs > 1) else ierr = VecCreateSeq(PETSC_COMM_SELF,nFeats,&e_ref);CHKERRQ(ierr); ierr = VecSet(e_ref,0.0);CHKERRQ(ierr); // set up z ierr = VecCreateSeq(PETSC_COMM_SELF,nContexts,&z);CHKERRQ(ierr); ierr = PetscLogEventEnd(READ_EVENT,0,0,0,0);CHKERRQ(ierr); #else SETERRQ(PETSC_ERR_SUP,"Binary event file format not implemented"); #endif } else { // open input file Datafile in(filename); measureFile(in,&nClasses,&nContexts,&nFeats,&nNZeros,&c); // add a correction feature, if necessary if (correct) { nFeats++; nNZeros += nClasses; } // allocate storage ierr = PetscMalloc((nContexts+1)*sizeof(int),&context);CHKERRQ(ierr); // read in file PetscTruth memd; PetscOptionsName("-memd","MEMD estimation","Estimate",&memd); if (nProcs == 1) { // uniprocessor version int *ptr, *row, iPoint, iClass, iContext, ii, jj; double *val; // allocate storage for events ierr = PetscMalloc((nClasses+1)*sizeof(int),&ptr);CHKERRQ(ierr); ierr = PetscMalloc(nNZeros*sizeof(int),&row);CHKERRQ(ierr); ierr = PetscMalloc(nNZeros*sizeof(double),&val);CHKERRQ(ierr); ierr = VecCreateSeq(PETSC_COMM_SELF,nClasses,&p_ref);CHKERRQ(ierr); if (memd) ierr = VecCreateSeq(PETSC_COMM_SELF,nClasses,&p0);CHKERRQ(ierr); // rewind to first context in.firstContext(); // read events as sparse matrix iPoint = iClass = iContext = 0; ierr = PetscLogEventBegin(READ_EVENT,0,0,0,0);CHKERRQ(ierr); while (in.getCount(&ii) != EOF) { if (ii > 0) { context[iContext] = iClass; iContext++; for (int i=0;i 0) { context[iContext] = iClass; for (int i=0; i= e0) && (iClass <= e1)) { if (firstContext == -1) { firstContext = iContext; } lastContext = iContext; // read in line if (in.getFreq(&freq,&jj) == EOF) SETERRQ(PETSC_ERR_FILE_READ,"Error reading data file"); dnz[iClass-e0] = nnz[iClass-e0] = 0; for (int j=0;j= f0) && (k <= f1)) { dnz[iClass-e0]++; } else { nnz[iClass-e0]++; } } // add correction feature if (correct) { if ((nFeats-1 >= f0) && (nFeats-1 <= f1)) { dnz[iClass-e0]++; } else { nnz[iClass-e0]++; } } } else { // skip to next line in.skipLine(); } iClass++; } iContext++; } } context[iContext] = iClass; ierr = PetscLogEventEnd(SCAN_EVENT,0,0,0,0);CHKERRQ(ierr); // allocate storage for events ierr = MatCreateMPIAIJ(PETSC_COMM_WORLD, (e1-e0+1), (f1-f0+1), nClasses, nFeats, 0, dnz, 0, nnz, &data); CHKERRQ(ierr); ierr = PetscFree(dnz); CHKERRQ(ierr); ierr = PetscFree(nnz); CHKERRQ(ierr); ierr = VecCreateMPI(PETSC_COMM_WORLD, (e1-e0+1), nClasses, &p_ref); CHKERRQ(ierr); // read events as parallel sparse matrix in.firstContext(); iClass = 0; ierr = PetscLogEventBegin(READ_EVENT,0,0,0,0);CHKERRQ(ierr); while ((in.getCount(&ii) != EOF)&&(iClass <= e1)) { if (ii > 0) { for (int i=0;i= e0)&&(iClass <= e1)) { // read in line if (in.getFreq(&freq,&jj) == EOF) SETERRQ(PETSC_ERR_FILE_READ,"Error reading data file"); ierr = VecSetValue(p_ref,iClass,freq,INSERT_VALUES); vv = 0.0; for (int j=0;j 1) SETERRQ(PETSC_ERR_SUP,"MPI bootstrapping not supported"); if (d.nContexts == 1) { // bootstrap events if (!d.initialized) { VecSum(d.p_ref,&zz); d.total = (int)zz; ierr = PetscMalloc(sizeof(int)*d.total,&d.counts); ii = d.counts; int p = 0; ierr = VecGetArray(d.p_ref,&xx);CHKERRQ(ierr); for (int i=0;i> var; VecSetValue(d.sigma,i,var,INSERT_VALUES); } in.close(); } VecAssemblyBegin(d.sigma); VecAssemblyEnd(d.sigma); d.penalty = L2; } // set up for Gaussian prior penalty (with one variance) PetscOptionsReal("-l2","Variance for L2 penalty","est",0.0,&tmp,&valid); if (valid && tmp != 0.0) { VecDuplicate(d.e_ref,&d.sigma); tmp = 1.0/tmp; VecSet(d.sigma,tmp); d.penalty = L2; } // set up for double exponential prior penalty (with one variance) PetscOptionsReal("-l1","Variance for L1 penalty","est",0.0,&tmp,&valid); if (valid && tmp != 0.0) { int nprocs; MPI_Comm_size(PETSC_COMM_WORLD,&nprocs); if (nprocs > 1) SETERRQ(PETSC_ERR_SUP,"MPI L1 smoothing not supported yet"); ierr = VecCreateSeq(PETSC_COMM_WORLD,d.nFeats*2,&d.sigma);CHKERRQ(ierr); tmp = 1.0/tmp; VecSet(d.sigma,tmp); // d.lambda = tmp; d.penalty = L1; } PetscOptionsGetString(PETSC_NULL,"-l1vars",filename,LEN,&valid); if (valid) { int nprocs; MPI_Comm_size(PETSC_COMM_WORLD,&nprocs); if (nprocs > 1) SETERRQ(PETSC_ERR_SUP,"MPI L1 smoothing not supported yet"); ierr = VecCreateSeq(PETSC_COMM_WORLD,d.nFeats*2,&d.sigma);CHKERRQ(ierr); // read variances std::ifstream in(filename); if (!in) SETERRQ(PETSC_ERR_FILE_OPEN,"Error opening variances file"); for (int i=0;i> var; VecSetValue(d.sigma,i,var,INSERT_VALUES); } in.close(); VecAssemblyBegin(d.sigma); VecAssemblyEnd(d.sigma); d.penalty = L1; } } // count active features VecGetLocalSize(d.e_ref,&n); VecGetArray(d.e_ref,&xx); ia = 0; for (int i = 0; i < n; i++) if (xx[i] != 0.0) ia++; VecRestoreArray(d.e_ref,&xx); MPI_Allreduce(&ia, &d.nActive, 1, MPI_INT, MPI_SUM, PETSC_COMM_WORLD); if (!d.initialized) { // transpose data matrix if (transpose) { Mat datat; ierr = MatTranspose(d.data, &datat);CHKERRQ(ierr); ierr = MatDestroy(d.data);CHKERRQ(ierr); d.data = datat; } } d.initialized = true; return 0; }