Commits

muzny committed d507054

comments on xml files

  • Participants
  • Parent commits f969cd4

Comments (0)

Files changed (5)

 
 The data download holds the following files:
 
-* allsenses.txt - Holds all sense data gathered via JWKTL from the November 12th, 2012 english wiktionary dump and computed values for the features described in classifier.features.numeric.Feature.
+* `allsenses.txt` - Holds all sense data gathered via JWKTL from the November 13th, 2012 English Wiktionary dump and computed values for the features described in classifier.features.numeric.Feature.
 
-* [test|dev]_unannotated.txt - The same as allsenses.txt, but only for the senses in the test or development data set.
+* `[test|dev]_unannotated.txt` - The same as `allsenses.txt`, but only for the senses in the test or development data set.
 
-* [test|dev]_annotated.txt - The same as [test|dev]_unannotated.txt, but with annotated labels. All data were annotated in accordance with the `AnnotationGuidlines.pdf` file that is also in the data download zip.
+* `[test|dev]_annotated.txt` - The same as `[test|dev]_unannotated.txt`, but with annotated labels. All data were annotated in accordance with the `AnnotationGuidlines.pdf` file that is also in the data download zip.
 
-* [test|dev]_[un]annotated_nofeatures.txt - The same as the corresponding files, but with no computed feature values.
+* `[test|dev]_[un]annotated_nofeatures.txt` - The same as the corresponding files, but with no computed feature values.
 
 
 ### MySQL

File WiktionaryIdioms/config/classifierconfig.xml

 <config>
 	<MySQL>
 		<classify>
-			<database>database_to_draw_from</database>
-			<table>table_to_draw_from</table>
-			<column>data_set_column</column>
-			<testData>what_the_data_set_column_is_for_the_test_set</testData>
+			<database></database>  <!-- the database to draw from -->
+			<table></table>  <!-- the table to draw from -->
+			<column></column>  <!-- the data set column -->
+			<testData></testData> <!-- what the data set column should be for the test data -->
 		</classify>
 	</MySQL>
 
 	<default>
-		<trainPath>./data/train.txt</trainPath>
-		<testPath>./data/dev_annotated.txt</testPath>
-	    <percents></percents>
+		<trainPath>./data/train.txt</trainPath>  <!-- path to the training data file -->
+		<testPath>./data/dev_annotated.txt</testPath>  <!-- path to the testing data file -->
 		
-		<features>1, 2, 4, 6, 8, 9, 10, 14</features>
-		<errorBound>.000001</errorBound>
-		<specificsPath>./specifics/specifics.txt</specificsPath>
-		<classifierType>perceptron</classifierType>
-		<precisionRecallGranularity>.01</precisionRecallGranularity>
+		<features>1, 2, 4, 6, 8, 9, 10, 14</features>  <!-- comma separated list of feature numbers to use -->
+		<errorBound>.000001</errorBound>  <!-- error bound for the classifier -->
 		
-		<cleanTest>false</cleanTest>
-		<testCorrectedLabel>true</testCorrectedLabel>
+		<specificsPath>./specifics/specifics.txt</specificsPath>  <!-- path to the specifics file -->
+		<classifierType>perceptron</classifierType>  <!-- type of classifier [perceptron|trustingperceptron] -->
+		<precisionRecallGranularity>.01</precisionRecallGranularity>  <!-- granularity of p/r results report -->
 		
-		<verbose>true</verbose>
-		<shouldOutputGeneral>true</shouldOutputGeneral>
-		<shouldOutputModel>true</shouldOutputModel>
+		<cleanTest>false</cleanTest>  <!-- whether to remove specific senses from the test (only works with database) -->
+		<testCorrectedLabel>true</testCorrectedLabel>  <!-- whether or not the test data should draw from the corrected_label column (only works with database) -->
+		
+		<verbose>true</verbose>  <!-- verbosity of output -->
+		<shouldOutputGeneral>true</shouldOutputGeneral>  <!-- whether to output a general info file -->
+		<shouldOutputModel>true</shouldOutputModel>  <!-- whether to output a model file for the trained classifier -->
 		
 		<output>
-			<file>file_name</file>
-			<generalInfo>true</generalInfo>
-			<confidenceList>true</confidenceList>
-			<precisionRecallPoints>true</precisionRecallPoints>
-			<randomErrorAnalysis>100</randomErrorAnalysis>
+			<file>filename</file>  <!-- the path to the file to output to -->
+			<generalInfo>true</generalInfo>  <!-- whether or not to output the general info -->
+			<confidenceList>false</confidenceList>  <!-- whether or not to output the confidence list -->
+			<precisionRecallPoints>false</precisionRecallPoints>  <!-- whether or not to output the p/r points -->
+			<randomErrorAnalysis>0</randomErrorAnalysis>  <!-- how many randomly sampled errors to output -->
 		</output>
+		
 		<outputModel>
-			<file>test</file>
+			<file>./models/</file> <!-- the path to the directory to output files to, with the name output.file + ".model" -->
 		</outputModel>
+		
 	</default>
 	
     
     <BasicApply>
-    	<learningRate>1</learningRate>
-		<iterations>93</iterations>
-		<modelFile></modelFile>
+    	<learningRate>1</learningRate>  <!-- The learning rate -->
+		<iterations>93</iterations>  <!-- Number of iterations to perform -->
+		<modelFile></modelFile>  <!-- Model file to read from (trains new classifier if empty) -->
     </BasicApply>
     
     <GridSearch>
-    	<group>groups</group>
-    	<iterationMax>100</iterationMax>
-    	<learningDelta>1</learningDelta>
+    	<group>groups</group>  <!-- Whether or not to test the different groups from classifier.features.numeric.Feature-->
+    							<!-- If empty, reads features from default section -->
+    	<iterationMax>100</iterationMax>  <!-- Max number of iterations to test up to -->
+    	<learningDelta>1</learningDelta> <!-- Amount to change the learning rate by. A learningDelta of 1 searches only over the number of iterations -->
     </GridSearch>
     
     <CompareFeatures>
-    	<features>6,8,9,10,14</features>
+    	<features></features>  <!-- Features to compare -->
 
-		<buildUp>true</buildUp>
-		<buildByBest>false</buildByBest>
-    	<iterationMax>100</iterationMax>
-    	<learningDelta>1</learningDelta>
+		<buildUp>true</buildUp> <!-- Whether to build up the features and test the built up groups -->
+		<buildByBest>false</buildByBest> <!-- Whether to build up features based on performance -->
+    	<iterationMax>100</iterationMax> <!-- same as in GridSearch -->
+    	<learningDelta>1</learningDelta> <!-- same as in GridSearch -->
     </CompareFeatures>
     
      <CompareGroups>
-    	<iterationMax>100</iterationMax>
+    	<iterationMax>100</iterationMax>  <!-- Same as CompareFeatures, but always compares groups -->
     	<learningDelta>1</learningDelta>
     </CompareGroups>
     
     <LabelData>
-    	<modelFile></modelFile>
-    	<precisionBoundary></precisionBoundary>
+    	<modelFile></modelFile>  <!-- model to use -->
+    	<precisionBoundary></precisionBoundary>  <!-- precision boundary, should be [0, 1] -->
     </LabelData>
 </config>

File WiktionaryIdioms/config/dbconfig.xml

 <!-- config.xml -->
 <config>
 	<detector>
-		<lookupDb></lookupDb>
-		<dbTable></dbTable>
+		<lookupDb></lookupDb> <!-- The Detector methods can use this database to look things up, -->
+		<dbTable></dbTable>  <!-- but they can also read from files. -->
 	</detector>
 	
 	<SenseFeature>
-		<lookupDb>wiktionary_data_sets</lookupDb>
+		<lookupDb></lookupDb>
 	</SenseFeature>
 	
 </config>

File WiktionaryIdioms/config/detectorconfig.xml

 <!-- config.xml -->
 <config>
 	<default>
-		<jwktlPath>/Users/golux/Research/wiktionary_output/</jwktlPath>
-		<database>wiktionary_data_sets</database>
-		<table>example_sentences</table>
-		<column>data_set_classifier</column>
-		<testData>test</testData>
-		<label>corrected_label</label>
-		<classifierModelCorrected>./paperOutputModels/devCorrectedUncleanedAll.model</classifierModelCorrected>
-		<classifierModelCleaned>./paperOutputModels/devCorrectedCleanedAll.model</classifierModelCleaned>
-		<classifierTable>sense_data_scaled</classifierTable>
+		<jwktlPath></jwktlPath>  <!-- path to the JWKTL wiktionary output directory -->
+		<database></database>  <!-- wiktionary database-->
+		<table></table>  <!-- example sentences table -->
+		<column></column>  <!-- data set column to read from -->
+		<testData></testData>  <!-- what the data set column should say for the test data -->
+		<label></label>  <!-- what column to read the label from -->
+		<classifierModelCorrected></classifierModelCorrected>  <!-- classifier model using the annotated dev set -->
+		<classifierModelCleaned></classifierModelCleaned>  <!-- classifier model using the annotated and cleaned of specifics dev set -->
+		<classifierTable></classifierTable> <!-- sense data table -->
 	</default>
-	
-	<IdentificationIncorporated>
-		<classifierModel>./outputModels/bestCompare.model</classifierModel>
-		<senseDb>wiktionary_data_sets</senseDb>
-		<senseTable>sense_data_scaled</senseTable>
-	</IdentificationIncorporated>
+
 	
 	<GoldenLabels>
-		<senseDb>wiktionary_data_sets</senseDb>
-		<senseTable>sense_data_scaled</senseTable>
+		<senseDb></senseDb>  <!-- database for wiktionary sense data -->
+		<senseTable>sense_data_scaled</senseTable>  <!-- table that the sense information is in -->
 	</GoldenLabels>
 	
 </config>

File WiktionaryIdioms/config/nodbconfig.xml

 <config>
 
 	<default>
-		<testPath>./data/dev_examples.txt</testPath>
-		<sensesPath>./data/dev_unannotated.txt</sensesPath>
-		<allSensesPath>./data/allsenses.txt</allSensesPath>
+		<testPath>./data/dev_examples.txt</testPath> <!-- path to the test example data -->
+		<sensesPath>./data/dev_unannotated.txt</sensesPath>  <!-- path to the senses that correspond to the examples -->
+		<allSensesPath>./data/allsenses.txt</allSensesPath>  <!-- path to the file with information about all senses -->
 	</default>
 	
 </config>