Home Reference Source Test

src/ColumnVectorizer.mjs

// import { ml, } from './ml';
import { nlp, } from './nlp';
// import { util as utils, } from './util';
import { DataSet, } from './DataSet';

/**
 * class creating sparse matrices from a corpus
 * @class ColumnVectorizer
 * @memberOf nlp
 */
export class ColumnVectorizer {
  /**
   * creates a new instance for turning text data into count vectors for machine learning
   * @example
   * const vectorizer = new ms.nlp.ColumnVectorizer({ data: csvData, maxFeatures: 9, });
   * @param {Object} [options={}]
   * @param {String[]} [options.data=[]] - corpus, one string per document
   * @param {number} [options.maxFeatures] - default cap on the number of feature columns
   * @prop {String[]} this.data - Array of strings
   * @prop {Set} this.tokens - Unique collection of all tokenized strings
   * @prop {Object[]} this.vectors - one object per document: token => count of appearances in that document
   * @prop {Object} this.wordMap - Object of all unique words, with value of 0
   * @prop {Object} this.wordCountMap - Object of all unique words, with value as total count of appearances
   * @prop {number} this.maxFeatures - max number of features
   * @prop {String[]} this.sortedWordCount - list of words as tokens sorted by total appearances (descending)
   * @prop {Array[]} this.limitedFeatures - the first maxFeatures entries of sortedWordCount, each wrapped in a one-element array
   * @prop {Array[]} this.matrix - words in sparse matrix
   * @prop {Function} this.replacer - clean string function
   */
  constructor(options = {}) {
    this.data = options.data || [];
    this.tokens = new Set();
    this.vectors = [];
    this.wordMap = {};
    this.wordCountMap = {};
    this.maxFeatures = options.maxFeatures;
    this.sortedWordCount = [];
    this.limitedFeatures = [];
    this.matrix = [];
    // Lower-cases the text, replaces every non-letter with a space, then
    // stems the tokens and re-joins them with single spaces.
    this.replacer = (value='') => {
      const cleanedValue = value
        .toLowerCase()
        .replace(/[^a-zA-Z]/gi, ' ');
      return nlp.PorterStemmer
        .tokenizeAndStem(cleanedValue)
        .join(' ');
    };
  }
  /** 
   * Returns a distinct array of all tokens
   * @return {String[]} returns a distinct array of all tokens, in first-seen order
  */
  get_tokens() {
    return Array.from(this.tokens);
  }
  /** 
   * Returns every token wrapped in its own one-element array (the same shape that
   * get_limited_features produces and that is handed to DataSet#columnMatrix)
   * @return {Array[]} returns array of single-token arrays for DataSet column matrices
  */
  get_vector_array() {
    return this.get_tokens().map(tok => [
      tok,
    ]);
  }
  /**
   * Fits and transforms data by creating column vectors (a sparse matrix where each row has every word in the corpus as a column and the count of appearances in the corpus)
   *
   * State accumulates: this.vectors, this.tokens and this.wordCountMap are extended,
   * not reset, so calling this again adds the new documents to the earlier ones.
   * @param {Object} [options={}]
   * @param {String[]} [options.data=this.data] - array of corpus data
   * @param {number} [options.maxFeatures] - max number of feature columns for this fit
   * @return {Array[]} matrix returned by DataSet#columnMatrix for the limited features
   */
  fit_transform(options = {}) {
    const data = options.data || this.data;
    const hasOwn = Object.prototype.hasOwnProperty;
    // Own-property check so that tokens matching inherited keys (e.g. "constructor")
    // start from 1 instead of being "incremented" from Object.prototype's value.
    const increment = (counts, key) => {
      counts[ key ] = hasOwn.call(counts, key)
        ? counts[ key ] + 1
        : 1;
    };
    data.forEach(datum => {
      const datums = {};
      this.replacer(datum)
        .split(' ')
        // ''.split(' ') is [''], so a document that cleans down to nothing
        // would otherwise register the empty string as a token.
        .filter(tok => tok.length > 0)
        .forEach(tok => {
          const token = tok.toLowerCase();
          increment(datums, token);
          increment(this.wordCountMap, token);
          this.tokens.add(token);
        });
      this.vectors.push(datums);
    });
    this.wordMap = Array.from(this.tokens).reduce((result, value) => { 
      result[ value ] = 0;
      return result;
    }, {});
    this.sortedWordCount = Object.keys(this.wordCountMap)
      .sort((a, b) => this.wordCountMap[ b ] - this.wordCountMap[ a ]);
    // Give every document a (zero) entry for every known token.
    this.vectors = this.vectors.map(vector => Object.assign({}, this.wordMap, vector));
    const vectorData = new DataSet(this.vectors);
    this.limitedFeatures = this.get_limited_features(options);
    this.matrix = vectorData.columnMatrix(this.limitedFeatures);
    return this.matrix;
  }
  /**
   * Returns limited sets of dependent features or all dependent features sorted by word count
   * @param {Object} [options={}]
   * @param {number} [options.maxFeatures] - max number of features; falls back to this.maxFeatures, then to the number of known tokens
   * @return {Array[]} the most frequent tokens, each wrapped in a one-element array
   */
  get_limited_features(options = {}) {
    const settings = options || {};
    const maxFeatures = settings.maxFeatures || this.maxFeatures || this.tokens.size;
 
    return this.sortedWordCount
      .slice(0, maxFeatures)
      .map(feature => [ feature, ]);
  }
  /**
   * returns word map with counts
   *
   * Only tokens already present in this.tokens are counted; every other known
   * token is reported with a count of 0.
   * @example
   * vectorizer.evaluateString('I would rate everything Great, views Great, food Great')
   * // => { realli: 0, good: 0, definit: 0, recommend: 0, wait: 0, staff: 0, rude: 0,
   * //      great: 3, view: 1, food: 1, not: 0, cold: 0, took: 0, forev: 0, seat: 0,
   * //      time: 0, prompt: 0, attent: 0, bland: 0, flavor: 0, kind: 0 }
   * @param {String} testString 
   * @return {Object} object of corpus words with counts
   */
  evaluateString(testString = '') {
    const hasOwn = Object.prototype.hasOwnProperty;
    const evalString = this.replacer(testString);
    const evalStringWordMap = evalString.split(' ').reduce((result, value) => { 
      if (this.tokens.has(value)) {
        // Own-property check: inherited keys such as "constructor" must start at 1.
        result[ value ] = hasOwn.call(result, value)
          ? result[ value ] + 1
          : 1;
      }
      return result;
    }, {});
    return Object.assign({}, this.wordMap, evalStringWordMap);
  }
  /**
   * returns new matrix of words with counts in columns
   *
   * When no options.maxFeatures is given, the feature columns chosen by the most
   * recent fit_transform call are reused so the row lines up with this.matrix.
   * @example
   * vectorizer.evaluate('I would rate everything Great, views Great, food Great')
   * // => [ [ 0, 1, 3, 0, 0, 0, 0, 0, 1 ] ]
   * @param {String} testString 
   * @param {Object} [options]
   * @param {number} [options.maxFeatures] - explicit max number of features for this call
   * @return {number[][]} sparse matrix row for new classification predictions
   */
  evaluate(testString='', options) {
    const stringObj = this.evaluateString(testString);
    const hasExplicitLimit = Boolean(options && options.maxFeatures);
    const limitedFeatures = (!hasExplicitLimit && this.limitedFeatures.length > 0)
      ? this.limitedFeatures
      : this.get_limited_features(options);
    const vectorData = new DataSet([
      stringObj,
    ]);
    return vectorData.columnMatrix(limitedFeatures);
  }
}