Casey Greene avatar Casey Greene committed 9bc4ef9

add initial set code

Comments (0)

Files changed (3)

 namespace Sleipnir {
 
 const char	CPairImpl::c_szQuantExt[]	= ".quant";
-const char   CDataPairImpl::c_acQdab[]   = ".qdab";
+const char  CDataPairImpl::c_acQdab[]   = ".qdab";
+const char  CDataPairImpl::c_acSet[]    = ".set";
 
 bool CPairImpl::Open( const char* szDatafile, std::ifstream& ifsm ) {
 	string		strToken;
  */
 bool CDataPair::Open( const char* szDatafile, bool fContinuous, bool fMemmap, size_t iSkip,
 	bool fZScore, bool fSeek ) {
-
-
 	g_CatSleipnir( ).notice( "CDataPair::Open( %s, %d )", szDatafile, fContinuous );
-	
 	Reset( fContinuous );
 	m_fQuantized = false; // cleared up front; OpenSet sets it back to true after binning the scores itself
-	
 	const char* file_ext = NULL;
-	
+
 	if((file_ext = strstr(szDatafile, c_acQdab)) != NULL){ // NOTE: strstr matches ".qdab" anywhere in the path, not only as the final extension
-
 	  return OpenQdab( szDatafile );
-	}
-	else{
+	} else if ((file_ext = strstr(szDatafile, c_acSet)) != NULL) { // NOTE: likewise matches ".set" anywhere in the path
+        if ( !OpenSet( szDatafile) ) { // gene-set files are loaded by OpenSet rather than CDat::Open
+            return false;
+        }
+	  return ( m_fContinuous ? true : OpenQuants( szDatafile ) ); // same tail as the generic branch: OpenQuants is only called for non-continuous data
+	} else{
 	  if( !CDat::Open( szDatafile, fMemmap, iSkip, fZScore, false, fSeek ) )
 	    return false;
-	  return ( m_fContinuous ? true : OpenQuants( szDatafile ) ); 	  
+	  return ( m_fContinuous ? true : OpenQuants( szDatafile ) );
 	}
 }
 
+
+/*!
+ * \brief
+ * Open the given gene set file as a CDat.
+ *
+ * \param szDatafile
+ * Filename of the set file from which the CDat is built.
+ *
+ * \returns
+ * True if the set file and its quantization information were read successfully; false if
+ * either file could not be opened or a line of the set file lacks a score.
+ *
+ * \remarks
+ * Each non-blank line of the set file must hold a gene ID followed by a score.  The scores are
+ * quantized using the bin edges read through CPairImpl::Open, and every gene pair is stored with
+ * the smaller of its two genes' bins, so a pair only reaches a bin that both of its genes reach.
+ * The values are therefore already quantized when this method returns.  It is important that the
+ * set file not contain any duplicate gene IDs.
+ *
+ * \see
+ * CDat::Open
+ */
+bool CDataPairImpl::OpenSet( const char* szDatafile ){
+    static const size_t c_iBuf  = 8192;
+    vector<float>   vecdScore;
+    vector<float>   vecdBin;
+    vector<float>   vecdRow;
+    string          strLine;
+    size_t          i, j;
+    ifstream        ifsm;
+    char            szBuf[ c_iBuf ];
+
+    g_CatSleipnir( ).notice( "CDataPair::OpenSet( %s )", szDatafile );
+
+    // Read the gene set: one "gene score" record per line.
+    ifsm.open( szDatafile );
+    if( !ifsm.is_open( ) ) {
+        g_CatSleipnir( ).error( "CDataPair::OpenSet( %s ) could not open set file", szDatafile );
+        return false;
+    }
+    // Loop on the read itself rather than on eof( ) so that a failed read ends the loop
+    // instead of spinning forever or processing a stale line twice.
+    while( std::getline( ifsm, strLine ) ) {
+        vector<string> vecstrLine;
+
+        CMeta::Tokenize( strLine.c_str( ), vecstrLine );
+        // Skip blank lines, whether the tokenizer reports them as no tokens or one empty token.
+        if( vecstrLine.empty( ) || ( ( vecstrLine.size( ) == 1 ) && vecstrLine[ 0 ].empty( ) ) )
+            continue;
+        if( vecstrLine.size( ) < 2 ) {
+            g_CatSleipnir( ).error( "CDataPair::OpenSet( %s ) missing score for gene: %s", szDatafile,
+                vecstrLine[ 0 ].c_str( ) );
+            return false;
+        }
+        m_vecstrGenes.push_back( vecstrLine[ 0 ] );
+        vecdScore.push_back( static_cast<float>( atof( vecstrLine[ 1 ].c_str( ) ) ) );
+    }
+    ifsm.close( );
+    // The stream hit end-of-file above; older standard libraries keep those error flags
+    // across close( )/open( ), which would make every read of the quant file fail.
+    ifsm.clear( );
+
+    // Read the quantization bin edges.
+    if( !CPairImpl::Open( szDatafile, ifsm ) )
+        return false;
+    szBuf[ 0 ] = 0;
+    ifsm.getline( szBuf, c_iBuf - 1 );
+    ifsm.close( );
+    if( !CPairImpl::Open( szBuf, m_vecdQuant ) )
+        return false;
+
+    // Quantize each gene's score once instead of once per pair.
+    vecdBin.reserve( vecdScore.size( ) );
+    for( i = 0; i < vecdScore.size( ); ++i )
+        vecdBin.push_back( static_cast<float>( CMeta::Quantize( vecdScore[ i ], m_vecdQuant ) ) );
+
+    // Fill half-matrix, not as nice as set membership tests but easy to plug in to existing
+    // class for proof of concept.
+    m_Data.Initialize( m_vecstrGenes.size( ) );
+
+    // A row buffer is only needed (and only indexed) when there are at least two genes;
+    // this also avoids the size_t underflow of GetGenes( ) - 1 on an empty set.
+    if( GetGenes( ) > 1 )
+        vecdRow.resize( GetGenes( ) - 1 );
+    for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
+        for( j = i + 1; j < GetGenes( ); ++j )
+            vecdRow[ j - i - 1 ] = min( vecdBin[ i ], vecdBin[ j ] );
+        CDat::Set( i, &vecdRow[ 0 ] );
+    }
+
+    // Vals already quantized during loading
+    m_fQuantized = true;
+
+    return true;
+}
+
 bool CDataPairImpl::OpenQdab( const char* szDatafile ){
   size_t	iTotal, i, j, num_bins, num_bits, iPos;
   float*	adScores;
 	std::vector<float>	m_vecdQuant;
 
 	static const char  c_acQdab[];
+	static const char  c_acSet[];
 	bool OpenQdab( const char* szDatafile );
+	bool OpenSet( const char* szDatafile );
 	void SetQuants( const float* adBinEdges, size_t iBins );
 	std::vector<float> GetQuants();
 };

tools/Counter/Counter.cpp

 
 static const char	c_acDab[]	= ".dab";
 static const char	c_acDat[]	= ".dat";
+static const char	c_acSet[]	= ".set";
 static const char	c_acQDab[]	= ".qdab";
 static const char	c_acQuant[]	= ".quant";
 static const char	c_acTxt[]	= ".txt";
                 const CGenes& GenesIn, const CGenes& GenesEx, const CGenes& GenesTm, const CGenes& GenesEd, const CGenes& GenesUbik ) {
     size_t				i, j, k, m, iTerm, iThread;
     vector<vector<CCountMatrix*>* >	vecpvecpMats;
-    vector<CCountMatrix*>		vecpMatRoots;
-    vector<CGenes*>			vecpGenes;
-    CDataPair				Answers, Dat;
-	CDat					wDat;
-    CDatFilter				Filter, FilterIn, FilterEx, FilterTm, FilterEd;
-    CDatFilter*				pFilter;
-    string	    			strFile;
-    vector<pthread_t>	    		vecpthdThreads;
+    vector<CCountMatrix*>   vecpMatRoots;
+    vector<CGenes*>         vecpGenes;
+    CDataPair               Answers, Dat;
+	CDat                    wDat;
+    CDatFilter              Filter, FilterIn, FilterEx, FilterTm, FilterEd;
+    CDatFilter*             pFilter;
+    string                  strFile;
+    vector<pthread_t>       vecpthdThreads;
     vector<SLearn>			vecsData;
     map<string, size_t>::const_iterator	iterZero;
     CGenome			    	Genome;
 	vector<CGenome>			Genomes;
     vector<string>			vecstrNames;
-    CRegularize			        Regularize;
+    CRegularize		        Regularize;
 	bool					isDatWeighted=false;
     if( !Answers.Open( sArgs.answers_arg, false, !!sArgs.memmap_flag ) ) {
         cerr << "Could not open: " << sArgs.answers_arg << endl;
 					isDatWeighted = true;
 					vecpGenes[ i ]->Open(wDat.GetGeneNames());}
 			}
-		}	
+		}
 		else{
 			if( !vecpGenes[ i ]->Open( ifsm ) ) {
 				cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
-				return 1;	}	
+				return 1;	}
 		}
 	}
-    
+
     if( !vecpGenes.size( ) ) {
         vecpGenes.insert( vecpGenes.begin( ), new CGenes( Genome ) );
         vecpGenes[ 0 ]->Open( Answers.GetGeneNames( ) );
             vecsData[ i ].m_pDat = NULL;
             vecsData[ i ].m_iDat = -1;
             vecsData[ i ].m_pGenes = vecpGenes[ i ];
-	    vecsData[ i ].m_pUbikGenes = &GenesUbik;
+            vecsData[ i ].m_pUbikGenes = &GenesUbik;
             vecsData[ i ].m_pAnswers = &Answers;
             vecsData[ i ].m_iZero = -1;
             vecsData[ i ].m_pRegularize = &Regularize;
     FOR_EACH_DIRECTORY_FILE((string)sArgs.directory_arg, strFile)
     string					strName;
     vector<CCountMatrix*>*	pvecpMatCounts;
-    
+
     if( CMeta::IsExtension( strFile, c_acDab ) ) {
         i = strFile.rfind( '.' );
         strName = (string) sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acDab;
         i = strFile.rfind( '.' );
         strName = (string) sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acQDab;
     } else if( CMeta::IsExtension( strFile, c_acDat ) ) {
-	i = strFile.rfind( '.' );
-	strName = (string) sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acDat;
+        i = strFile.rfind( '.' );
+        strName = (string) sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acDat;
+    } else if( CMeta::IsExtension( strFile, c_acSet ) ) {
+        i = strFile.rfind( '.' );
+        strName = (string) sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acSet;
     } else {
         continue;
     }
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.