
test/unit/columnvectorizer_spec.mjs

import * as ms from '../../index.mjs';
import chai from 'chai';
const expect = chai.expect;
const csvData = [
  {
    'Review': 'This is really good',
    'Liked': 1,
  },
  {
    'Review': 'I would definitely recommend',
    'Liked': 1,
  },
  {
    'Review': 'The wait staff was really rude',
    'Liked': 0,
  },
  {
    'Review': 'Great views',
    'Liked': 1,
  },
  {
    'Review': 'the food was not great',
    'Liked': 0,
  },
  {
    'Review': 'food came out cold, took forever to get seated',
    'Liked': 0,
  },
  {
    'Review': 'we had a great time, and they were really prompt and attentive',
    'Liked': 1,
  },
  {
    'Review': 'the food was bland',
    'Liked': 0,
  },
  {
    'Review': 'not very flavorful',
    'Liked': 0,
  },
  {
    'Review': 'it was kind of so-so',
    'Liked': 0,
  },
];
const eVString = 'I would rate everything Great, views Great, food Great';
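// 'Great' appears three times and, once stemmed, 'views' becomes 'view';
// the evaluateString and evaluate specs below assert those counts.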

describe('nlp', function() {
  describe('ColumnVectorizer class', () => {
    const CSVDataSet = new ms.DataSet(csvData);
    const nlpVectors = new ms.nlp.ColumnVectorizer({
      data: CSVDataSet.columnArray('Review'),
      maxFeatures: 9,
    });
    nlpVectors.fit_transform();
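    // fit_transform tokenizes and stems each review, then populates this.tokens,
    // this.wordCountMap, this.wordMap, this.sortedWordCount, this.vectors, and
    // this.matrix; the fit_transform specs below exercise each of these.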
    describe('constructor', () => {
      it('should instantiate a new ColumnVectorizer', () => {
        expect(ms.nlp).to.be.an('object');
        expect(ms.nlp.ColumnVectorizer).to.be.a('function');
        expect(nlpVectors).to.be.instanceof(ms.nlp.ColumnVectorizer);
        expect(nlpVectors.maxFeatures).to.eql(9);
      });
    });
    describe('get_tokens', () => {
      it('should return an array of all tokens', () => {
        const toks = nlpVectors.get_tokens();
        expect(toks).to.be.an('array');
        expect(toks).to.have.lengthOf(nlpVectors.tokens.size);
      });
    });
    describe('get_vector_array', () => {
      it('should return an array of tokens as vectors', () => {
        const toks = nlpVectors.get_vector_array();
        expect(toks).to.be.an('array');
        expect(toks[0]).to.be.an('array');
        expect(toks).to.have.lengthOf(nlpVectors.tokens.size);
      });
    });
    describe('get_limited_features', () => {
      it('should return an array of token vectors capped at maxFeatures', () => {
        const feats = nlpVectors.get_limited_features();
        expect(feats).to.be.an('array');
        expect(feats[0]).to.be.an('array');
        expect(feats).to.have.lengthOf(nlpVectors.maxFeatures);
      });
      it('should limit the number of features when maxFeatures is passed', () => {
        const feats = nlpVectors.get_limited_features({ maxFeatures: 5, });
        expect(feats).to.be.an('array');
        expect(feats[0]).to.be.an('array');
        expect(feats).to.have.lengthOf(5);
      });
    });
    describe('evaluateString', () => {
      it('should return object of tokens and counts', () => {
        const estring = nlpVectors.evaluateString(eVString);
        expect(estring.great).to.eql(3);
        expect(estring.view).to.eql(1);
        expect(estring.food).to.eql(1);
      });
    });
    describe('evaluate', () => {
      it('should return a matrix of vectors for new predictions', () => {
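        // evaluate vectorizes the string against the limited feature set, so each
        // row has maxFeatures (9) entries and exactly one of them (presumably the
        // 'great' column) holds a count of 3.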
        const estring = nlpVectors.evaluate(eVString);
        expect(estring).to.be.an('array');
        expect(estring[ 0 ]).to.have.lengthOf(9);
        expect(estring[ 0 ].filter(val => val === 3).length).to.eql(1);
      });
    });
    describe('fit_transform', () => {
      it('should create a set of unique tokens in this.tokens', () => {
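        // Rebuild the expected token set independently: lowercase each review,
        // stem it with PorterStemmer, and collect the unique stems.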
        const stemmedText = csvData.reduce((result, value) => {
          const val = value.Review.toLowerCase();
          const stringVal = ms.nlp.PorterStemmer.tokenizeAndStem(val).join(' ');
          result += stringVal + ' ';
          return result;
        }, '');
        const tokenSet = new Set(stemmedText.split(' ').filter(val => val));
        expect(nlpVectors.tokens.size).to.eql(tokenSet.size);
        tokenSet.forEach((val) => {
          expect(nlpVectors.tokens.has(val)).to.be.true;
        });
      });
      it('should create a dictionary of total word counts in this.wordCountMap', () => {
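        // Recompute the expected totals with the same stemmer and compare per token.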
        const wordCountMap = csvData.reduce((result, value) => {
          const val = value.Review.toLowerCase();
          const stringVals = ms.nlp.PorterStemmer.tokenizeAndStem(val);
          stringVals.forEach(token => {
            result[ token ] = (result[ token ])
              ? result[ token ] + 1
              : 1;
          });
          return result;
        }, {});
        Object.keys(wordCountMap).forEach(word => {
          expect(wordCountMap[ word ]).to.eql(nlpVectors.wordCountMap[ word ]);
        });
      });
      it('should create a dictionary of all words in this.wordMap', () => {
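        // Every known token should start with a count of zero in wordMap.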
        Array.from(nlpVectors.tokens).forEach(token => {
          expect(nlpVectors.wordMap[ token ]).to.eql(0);
        });
      });
      it('should create an array of words sorted by count in this.sortedWordCount', () => {
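        // Each word's count should be greater than or equal to the next word's count.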
        nlpVectors.sortedWordCount.forEach((word, i) => {
          if (i < nlpVectors.sortedWordCount.length - 1) {
            const nextWord = nlpVectors.sortedWordCount[ i + 1 ];
            expect(nlpVectors.wordCountMap[ word ]).to.be.gte(nlpVectors.wordCountMap[ nextWord ]);
          }
        });
      });
      it('should create sparse word-count dictionaries from the corpus in this.data as this.vectors', () => {
        const firstSentence = csvData[ 0 ].Review;
        const firstSentenceWordMap = nlpVectors.evaluateString(firstSentence);
        expect(firstSentenceWordMap).to.eql(nlpVectors.vectors[ 0 ]);
      });
      it('should create a sparse matrix of word vectors from the corpus as this.matrix', () => {
        const firstSentence = csvData[ 0 ].Review;
        const firstSentenceWordMap = nlpVectors.evaluate(firstSentence);
        expect(firstSentenceWordMap[ 0 ]).to.eql(nlpVectors.matrix[ 0 ]);
      });
    });
  });
});