Source

uchardet-enhanced / essais / scalarProduct.diff

diff -w base/nsSBCSGroupProber.cpp base-scalar/nsSBCSGroupProber.cpp
48a49,61
> float scalar(const PRUint8 a1[4096], const PRUint8 a2[4096])
> {
>   // Cosine of the angle between two 4096-entry byte vectors (0 if either is all-zero).
>   double normSq1 = 0, normSq2 = 0, dot = 0;
>   for (int k = 0; k != 4096; ++k) {
>     const int x = a1[k], y = a2[k];
>     normSq1 += x * x;
>     normSq2 += y * y;
>     dot += x * y;
>   }
>   return (normSq1 == 0 || normSq2 == 0) ? 0 : float(dot / (sqrt(normSq1) * sqrt(normSq2)));
> }
> 
94a108,116
> #if 0
>   fprintf(stderr, "tr * se %.3f, se * fr %.3f, de * fr %.3f, de * se %.3f \n", 
>           scalar(ISO_8859_9turkishModel.precedenceMatrix, cp1252swedishModel.precedenceMatrix),
>           scalar(cp1252swedishModel.precedenceMatrix, cp1252frenchModel.precedenceMatrix),
>           scalar(cp1252frenchModel.precedenceMatrix, cp1252germanModel.precedenceMatrix),
>           scalar(cp1252swedishModel.precedenceMatrix, cp1252germanModel.precedenceMatrix)
>     );
> #endif
> 
diff -w base/nsSBCharSetProber.cpp base-scalar/nsSBCharSetProber.cpp
58c58,62
< 
---
> #if 0
>     if (order >= SAMPLE_SIZE)
>       printf("Got non-frequent char: order %d, 0x%x [%c]\n", order,
>              unsigned((unsigned char)aBuf[i]), aBuf[i]);
> #endif
67a72
>           ++mFreqArray[mLastOrder*SAMPLE_SIZE+order];
69a75
>           ++mFreqArray[order*SAMPLE_SIZE+mLastOrder];
94a101,109
>   for (int i = 0 ; i < 64*64; i++) {
>     mFreqArray[i] = 0;
>   }
>   mSeqModelModulus = 0.0;
>   for (int i = 0 ; i < 64*64; i++) {
>     mSeqModelModulus += mModel->precedenceMatrix[i] * mModel->precedenceMatrix[i];
>   }
>   mSeqModelModulus = sqrt(mSeqModelModulus);
> 
119a135
>     r = scalarProduct() * 1.25;
126a143,189
> #include <memory.h>
> #define CLASSIFY 1
> // Cosine similarity between the model's precedence matrix and the observed
> // pair counts in mFreqArray. With CLASSIFY set, the counts are first bucketed
> // into classes 0..3. Returns 0 when either vector has a zero norm.
> float nsSingleByteCharSetProber::scalarProduct()
> {
>   const int kPairs = 4096;  // entries in mFreqArray (64*64)
>   int freqClassArray[kPairs];
> #if CLASSIFY
>   // Sort the counts largest first so counts[k] is the threshold for the top k.
>   std::vector<int> counts(mFreqArray, mFreqArray + kPairs);
>   std::sort(counts.rbegin(), counts.rend());
>   const int t512 = counts[512];    // count at descending rank 512
>   const int t1024 = counts[1024];  // count at descending rank 1024
>   for (int i = 0; i < kPairs; i++) {
>     const int cntocs = mFreqArray[i];
>     int fclass;
>     if (cntocs > 0 && cntocs >= t512) {
>       fclass = 3;
>     } else if (cntocs > 0 && cntocs >= t1024) {
>       fclass = 2;
>     } else if (cntocs >= 2) {
>       fclass = 1;
>     } else {
>       fclass = 0;
>     }
>     freqClassArray[i] = fclass;
>   }
> #else
>   memcpy(freqClassArray, mFreqArray, sizeof(mFreqArray));
> #endif
> 
>   double scalar = 0.0;
>   double modulus = 0.0;
>   for (int i = 0; i < kPairs; i++) {
>     scalar += double(mModel->precedenceMatrix[i]) * freqClassArray[i];
>     modulus += double(freqClassArray[i]) * freqClassArray[i];
>   }
>   modulus = sqrt(modulus);
>   // Guard both norms: a zero norm would make the quotient NaN or infinite.
>   if (modulus > 0.0 && mSeqModelModulus > 0.0) {
>     return float(scalar / (modulus * mSeqModelModulus));
>   }
>   //printf("Scalar: returning 0 because one of the vectors is null\n");
>   return 0.0;
> }
138,139c201,202
<   printf("  SBCS: %1.3f [%s] [%s]\r\n", GetConfidence(), 
<          mModel->langName, GetCharSetName());
---
>   printf("  SBCS: %1.3f [%s] [%s] Scalar %.3f\r\n", GetConfidence(), 
>          mModel->langName, GetCharSetName(), scalarProduct());
diff -w base/nsSBCharSetProber.h base-scalar/nsSBCharSetProber.h
76a77
>   virtual float     scalarProduct();
93a95,96
>   PRUint32 mFreqArray[64*64];
>   float    mSeqModelModulus;