
src/cross_validation.mjs

import { default as Random, } from 'random-js';
import { default as range, } from 'lodash.range';
import { ml, } from './ml';
import { util, } from './util';
import { DataSet, } from './DataSet';
import { default as jgsl, } from 'js-grid-search-lite';
const { GridSearch, } = jgsl;
const Matrix = ml.Matrix;
const ConfusionMatrix = ml.ConfusionMatrix;

/**
 * Split arrays into random train and test subsets
 * @memberOf cross_validation
 * @example
 * const testArray = [20, 25, 10, 33, 50, 42, 19, 34, 90, 23, ];
// { train: [ 50, 20, 34, 33, 10, 23, 90, 42 ], test: [ 25, 19 ] }
const trainTestSplit = ms.cross_validation.train_test_split(testArray,{ test_size:0.2, random_state: 0, });
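// hypothetical variant of the call above: with return_array: true the same split comes back as [ train, test, ] instead of { train, test, }
const [trainSet, testSet, ] = ms.cross_validation.train_test_split(testArray, { test_size: 0.2, random_state: 0, return_array: true, });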
 * @param {array} dataset - array of data to split
 * @param {object} options
 * @param {number} [options.test_size=0.2] - the proportion of the dataset to include in the test split; ignored when train_size is provided
 * @param {number} [options.train_size=0.8] - the proportion of the dataset to include in the train split
 * @param {number} [options.random_state=0] - the seed used by the random number generator
 * @param {boolean} [options.return_array=false] - return the split as an array [train, test] instead of the default object { train, test }
 * @param {boolean} [options.parse_int_train_size=true] - truncate the computed train size to an integer with parseInt
 * @returns {(Object|array)} returns the training and test sets, either as an object { train, test } or as an array [train, test]
 */
function train_test_split(dataset = [], options = {
  test_size: 0.2,
  train_size: 0.8,
  random_state: 0,
  return_array: false,
  parse_int_train_size: true,
}) {
  const engine = Random.engines.mt19937().seed(options.random_state || 0);
  const training_set = [];
  const parse_int_train_size = (typeof options.parse_int_train_size === 'boolean') ? options.parse_int_train_size : true;
  const train_size_length = (options.train_size)
    ? options.train_size * dataset.length
    : (1 - (options.test_size || 0.2)) * dataset.length;
  const train_size = parse_int_train_size
    ? parseInt(train_size_length, 10)
    : train_size_length;
  const dataset_copy = [].concat(dataset);

  while (training_set.length < train_size) {
    const index = Random.integer(0, (dataset_copy.length - 1))(engine);
    // console.log({ index });
    training_set.push(dataset_copy.splice(index, 1)[0]);
  }
  return (options.return_array) ? [training_set, dataset_copy,] : {
    train: training_set,
    test: dataset_copy,
  };
}

/**
 * Splits a dataset into k randomly sampled folds. Each fold can then be used once as a validation set
 * while the k - 1 remaining folds form the training set.
 * @memberOf cross_validation
 * @example
 * const testArray = [20, 25, 10, 33, 50, 42, 19, 34, 90, 23, ];
// [ [ 50, 20, 34, 33, 10 ], [ 23, 90, 42, 19, 25 ] ] 
const crossValidationArrayKFolds = ms.cross_validation.cross_validation_split(testArray, { folds: 2, random_state: 0, });
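// ms.cross_validation.kfolds is exported as an alias of cross_validation_split, so this hypothetical call yields the same folds
const crossValidationKFoldsAlias = ms.cross_validation.kfolds(testArray, { folds: 2, random_state: 0, });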
 * @param {array} dataset - array of data to split
 * @param {object} options
 * @param {number} [options.folds=3] - Number of folds 
 * @param {number} [options.random_state=0] - the seed used by the random number generator
 * @returns {array} returns the dataset split into k folds
 */
function cross_validation_split(dataset = [], options = {
  folds: 3,
  random_state: 0,
}) { //kfolds
  const engine = Random.engines.mt19937().seed(options.random_state || 0);
  const folds = options.folds || 3;
  const dataset_split = [];
  const dataset_copy = [].concat(dataset);
  const foldsize = parseInt(dataset.length / folds, 10);
  for (const i of range(folds)) {
    const fold = [];
    while (fold.length < foldsize) {
      const index = Random.integer(0, (dataset_copy.length - 1))(engine);
      fold.push(dataset_copy.splice(index, 1)[0]);
    }
    dataset_split.push(fold);
  }

  return dataset_split;
}

/**
 * Used to test variance and bias of a prediction
 * @memberOf cross_validation
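 * @example
 * // hypothetical usage sketch: assumes `ms` is this module's exported namespace, `dataset` and
// `testingset` are arrays of objects with X and Y columns, and MultivariateLinearRegression is a
// constructor from a compatible ml package (as in the options.regression description)
const regressionScores = ms.cross_validation.cross_validate_score({
  regression: MultivariateLinearRegression,
  dataset,
  testingset,
  dependentFeatures: [['X', ], ],
  independentFeatures: [['Y', ], ],
  folds: 10,
  accuracy: 'standardError',
});
// returns an array of 10 scores, one standard error value per fold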
 * @param {object} options
 * @param {function} options.classifier - instance of classification model used for training, or function to train a model. e.g. new DecisionTreeClassifier({ gainFunction: 'gini', }) or ml.KNN
 * @param {function} options.regression - instance of regression model used for training, or function to train a model. e.g. new RandomForestRegression({ nEstimators: 300, }) or ml.MultivariateLinearRegression
 * @param {array} options.dataset - training data that is split into k folds
 * @param {array} options.testingset - data used to evaluate the model trained on each fold
 * @param {array} [options.dependentFeatures=[['X']]] - names of the columns used as model inputs (x)
 * @param {array} [options.independentFeatures=[['Y']]] - names of the columns used as target values (y)
 * @param {number} [options.folds=10] - number of cross validation folds
 * @param {string} [options.accuracy='standardError'] - regression accuracy metric, either 'standardError' or 'rSquared'; classifiers always use confusion matrix accuracy
 * @param {object} [options.modelOptions] - additional options passed to the model's train method or constructor
 * @return {number[]} Array of accuracy calculations, one per fold 
 */
function cross_validate_score(options = {}) {
  const config = Object.assign({}, {
    // classifier,
    // regression,
    // dataset,
    // testingset,
    dependentFeatures: [['X', ], ],
    independentFeatures: [['Y', ], ],
    // random_state,
    folds: 10,
    accuracy: 'standardError',
    use_train_x_matrix: true,
    use_train_y_matrix: false,
    use_train_y_vector: false,
    use_estimates_y_vector: false,
  }, options);
  const classifier = config.classifier;
  const regression = config.regression;
  const folds = cross_validation_split(config.dataset, {
    folds: config.folds,
    random_state: config.random_state,
  });
  const testingDataSet = new DataSet(config.testingset);
  const y_test_matrix = testingDataSet.columnMatrix(config.independentFeatures);
  const x_test_matrix = testingDataSet.columnMatrix(config.dependentFeatures);
  const actuals = util.pivotVector(y_test_matrix)[ 0 ];
  // console.log({ actuals });
  const prediction_accuracies = folds.map(fold => { 
    const trainingDataSet = new DataSet(fold);
    const x_train = trainingDataSet.columnMatrix(config.dependentFeatures);
    const y_train = trainingDataSet.columnMatrix(config.independentFeatures);
    const x_train_matrix = (config.use_train_x_matrix)
      ? new Matrix(x_train)
      : x_train;
    const y_train_matrix = (config.use_train_y_matrix)
      ? new Matrix(y_train)
      : (config.use_train_y_vector)
        ? util.pivotVector(y_train)[0]
        : y_train;
    if (regression) {
      let regressor;
      let pred_y_test;
      if (typeof regression.train === 'function') {
        regressor = regression.train(x_train_matrix, y_train_matrix, config.modelOptions);
        pred_y_test = regression.predict(x_test_matrix);
      } else {
        regressor = new regression(x_train_matrix, y_train_matrix, config.modelOptions);
        pred_y_test = regressor.predict(x_test_matrix);
      }
      // console.log({ x_test_matrix });
      // console.log({ pred_y_test });
      const estimates = pred_y_test;//util.pivotVector(pred_y_test)[0];
      // console.log({ estimates, actuals });
      return (config.accuracy === 'standardError')
        ? util.standardError(actuals, estimates)
        : util.rSquared(actuals, estimates);
    } else {
      let classification;
      let estimates;
      if (typeof classifier.train === 'function') {
        classifier.train(x_train_matrix, y_train_matrix, config.modelOptions);
        estimates = classifier.predict(x_test_matrix);
      } else {
        classification = new classifier(x_train_matrix, y_train_matrix, config.modelOptions);
        estimates = classification.predict(x_test_matrix);
      }
      // classification.train(x_train_matrix, y_train_matrix);
      // classifier.train(x_train_matrix, y_train_matrix);
      const compareEstimates = (config.use_estimates_y_vector)
        ? util.pivotVector(estimates)[ 0 ]
        : estimates;
      const CM = ConfusionMatrix.fromLabels(actuals, compareEstimates);
      return CM.getAccuracy();
    }
  });
  return prediction_accuracies;
}

/**
 * Used to test variance and bias of a prediction with parameter tuning
 * @memberOf cross_validation
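 * @example
 * // hypothetical usage sketch: assumes `ms` is this module's exported namespace, DecisionTreeClassifier
// is a compatible classifier constructor that accepts a single options object, and `parameters` lists
// the candidate values for each model option
const bestResult = ms.cross_validation.grid_search({
  classifier: DecisionTreeClassifier,
  parameters: { gainFunction: ['gini', ], maxDepth: [5, 10, ], },
  dataset,
  testingset,
});
// bestResult is the highest scoring grid search result; set return_parameters: true to get every sorted result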
 * @param {object} options - accepts the same options as cross_validate_score plus the grid search options below
 * @param {function} options.classifier - instance of classification model used for training, or function to train a model. e.g. new DecisionTreeClassifier({ gainFunction: 'gini', }) or ml.KNN
 * @param {function} options.regression - instance of regression model used for training, or function to train a model. e.g. new RandomForestRegression({ nEstimators: 300, }) or ml.MultivariateLinearRegression
 * @param {object} [options.parameters={}] - candidate model parameters passed to GridSearch; each generated parameter set is used to construct a model and scored with cross_validate_score
 * @param {string} [options.compare_score='mean'] - name of the util method used to reduce the per fold scores to a single comparable number
 * @param {string} [options.sortAccuracyScore='desc'] - sort order for the compared scores; defaults to 'asc' when a regression is supplied
 * @param {boolean} [options.return_parameters=false] - return every sorted grid search result instead of only the best one
 * @return {(Object|array)} the best scoring grid search result, or all sorted results when return_parameters is true 
 */
function grid_search(options = {}) {
  const config = Object.assign({}, {
    return_parameters: false,
    compare_score: 'mean',
    sortAccuracyScore: 'desc',
    parameters: {},
  }, options);
  const regressor = config.regression;
  const classification = config.classifier;
  const sortAccuracyScore = (!options.sortAccuracyScore && config.regression)
    ? 'asc'
    : config.sortAccuracyScore;
  
  // const scoreSorter = ;
  const gs = new GridSearch({
    params: config.parameters,
    run_callback: (params) => {
      if (config.regression) {
        config.regression = new regressor(params);
      } else {
        config.classifier = new classification(params);
      }
      const score = cross_validate_score(config);
      return (config.compare_score)
        ? util[config.compare_score](score)
        : score;
    },
  });
  gs.run();
  const accuracySorter = (sortAccuracyScore === 'desc')
    ? (a, b) => b.results - a.results
    : (a, b) => a.results - b.results;
  const results = gs._results.sort(accuracySorter);
  // GridSearch;
  return config.return_parameters
    ? results
    : results[ 0 ];
}

/**
 * @namespace
 * @see {@link https://machinelearningmastery.com/implement-resampling-methods-scratch-python/}
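 * @example
 * // hypothetical usage sketch: `cross_validation` is this module's named export; the examples above
// reference it through a package level namespace such as ms.cross_validation
const { train_test_split, kfolds, cross_validate_score, grid_search, } = cross_validation;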
 */
export const cross_validation = {
  train_test_split,
  cross_validation_split,
  kfolds: cross_validation_split,
  cross_validate_score,
  grid_search,
  GridSearch,
};