
Source code for mclearn.preprocessing

""" Useful general-purpose preprocessing functions. """

import gc
import os.path
import pandas as pd
import numpy as np
import nose.tools
from numpy.random import RandomState


def normalise_z(features):
    """ Normalise each feature to have zero mean and unit variance.

        Parameters
        ----------
        features : array, shape = [n_samples, n_features]
            Each row is a sample point and each column is a feature.

        Returns
        -------
        features_normalised : array, shape = [n_samples, n_features]
    """

    mu = np.mean(features, axis=0)
    sigma = np.std(features, axis=0)
    return (features - mu) / sigma
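As a quick usage sketch (the array values below are made up for illustration), normalise_z standardises each column independently:

    import numpy as np
    from mclearn.preprocessing import normalise_z

    # a toy feature matrix: 3 samples, 2 features on very different scales
    features = np.array([[1.0, 200.0],
                         [2.0, 300.0],
                         [3.0, 400.0]])

    scaled = normalise_z(features)
    # each column now has approximately zero mean and unit variance
    print(scaled.mean(axis=0))   # ~[0. 0.]
    print(scaled.std(axis=0))    # ~[1. 1.]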
def normalise_unit_var(features):
    """ Normalise each feature to have unit variance.

        Parameters
        ----------
        features : array, shape = [n_samples, n_features]
            Each row is a sample point and each column is a feature.

        Returns
        -------
        features_normalised : array, shape = [n_samples, n_features]
    """

    sigma = np.std(features, axis=0)
    return features / sigma
def normalise_01(features):
    """ Normalise each feature to the unit interval.

        Parameters
        ----------
        features : array, shape = [n_samples, n_features]
            Each row is a sample point and each column is a feature.

        Returns
        -------
        features_normalised : array, shape = [n_samples, n_features]
    """

    minimum = np.min(features, axis=0)
    maximum = np.max(features, axis=0)
    return (features - minimum) / (maximum - minimum)
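A similar sketch for min-max scaling (again with made-up values): normalise_01 maps each column's minimum to 0 and its maximum to 1.

    import numpy as np
    from mclearn.preprocessing import normalise_01

    features = np.array([[10.0, 5.0],
                         [20.0, 7.0],
                         [30.0, 9.0]])

    scaled = normalise_01(features)
    print(scaled.min(axis=0))   # [0. 0.]
    print(scaled.max(axis=0))   # [1. 1.]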
def _get_train_test_size(train_size, test_size, n_samples):
    """ Convert user train and test size inputs to integers. """

    if test_size is None and train_size is None:
        test_size = 0.3
        train_size = 1.0 - test_size

    if isinstance(test_size, float):
        test_size = np.round(test_size * n_samples).astype(int)
    if isinstance(train_size, float):
        train_size = np.round(train_size * n_samples).astype(int)

    if test_size is None:
        test_size = n_samples - train_size
    if train_size is None:
        train_size = n_samples - test_size

    return train_size, test_size


@nose.tools.nottest
def balanced_train_test_split(X, y, test_size=None, train_size=None, bootstrap=False,
                              random_state=None):
    """ Split the data into a balanced training set and test set of some given size.

        For a dataset with an unequal number of samples in each class, one useful
        procedure is to split the data into a training and a test set in such a way
        that the classes are balanced.

        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
            Feature matrix.

        y : array, shape = [n_samples]
            Target vector.

        test_size : float or int (default=0.3)
            If float, should be between 0.0 and 1.0 and represent the proportion of
            the dataset to include in the test split. If int, represents the absolute
            number of test samples. If None, the value is automatically set to the
            complement of the train size. If train size is also None, test size is
            set to 0.3.

        train_size : float or int (default=1-test_size)
            If float, should be between 0.0 and 1.0 and represent the proportion of
            the dataset to include in the train split. If int, represents the absolute
            number of train samples. If None, the value is automatically set to the
            complement of the test size.

        bootstrap : boolean, optional (default=False)
            If True, sample with replacement, allowing a balanced split even when the
            smallest class does not contain enough samples on its own.

        random_state : int, optional (default=None)
            Pseudo-random number generator state used for random sampling.

        Returns
        -------
        X_train : array
            The feature vectors (stored as rows) in the training set.

        X_test : array
            The feature vectors (stored as rows) in the test set.

        y_train : array
            The target vector in the training set.

        y_test : array
            The target vector in the test set.
    """

    # initialise the random number generator
    rng = RandomState(random_state)

    # make sure X and y are numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)

    # get information about the class distribution
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    cls_count = np.bincount(y_indices)

    # get the training and test size
    train_size, test_size = _get_train_test_size(train_size, test_size, len(y))

    # number of samples in each class that is included in the training and test set
    n_train = np.round(train_size / n_classes).astype(int)
    n_test = np.round(test_size / n_classes).astype(int)
    n_total = n_train + n_test

    # make sure we have enough samples to create a balanced split
    min_count = min(cls_count)
    if min_count < (n_train + n_test) and not bootstrap:
        raise ValueError('The smallest class contains {} examples, which is not '
                         'enough to create a balanced split. Choose a smaller size '
                         'or enable bootstrapping.'.format(min_count))

    # selected indices are stored here
    train = []
    test = []

    # get the desired sample from each class
    for i, cls in enumerate(classes):
        if bootstrap:
            shuffled = rng.choice(cls_count[i], n_total, replace=True)
        else:
            shuffled = rng.permutation(cls_count[i])
        cls_i = np.where(y == cls)[0][shuffled]
        train.extend(cls_i[:n_train])
        test.extend(cls_i[n_train:n_total])

    train = list(rng.permutation(train))
    test = list(rng.permutation(test))

    return X[train], X[test], y[train], y[test]
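A minimal sketch of how the balanced split might be called, using a toy imbalanced dataset generated on the spot (sizes and labels are purely illustrative):

    import numpy as np
    from mclearn.preprocessing import balanced_train_test_split

    # toy imbalanced data: 100 samples of class 'a', 30 of class 'b'
    X = np.random.randn(130, 4)
    y = np.array(['a'] * 100 + ['b'] * 30)

    X_train, X_test, y_train, y_test = balanced_train_test_split(
        X, y, train_size=40, test_size=20, random_state=0)

    # each class contributes equally to each split
    print(np.unique(y_train, return_counts=True))   # (['a', 'b'], [20, 20])
    print(np.unique(y_test, return_counts=True))    # (['a', 'b'], [10, 10])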
def csv_to_hdf(csv_path, no_files=1, hdf_path='store.h5', data_cols=None,
               expectedrows=7569900, min_itemsize=40, table_name='table'):
    """ Convert csv files to an HDF5 table.

        Parameters
        ----------
        csv_path : str
            The path pattern of the source csv files; it is filled in with the part
            number via ``csv_path.format(i)``.

        no_files : int
            The number of csv parts.

        hdf_path : str
            The path of the output.

        data_cols : array
            The names of the columns. Should be the same as the first line in the
            first csv file.

        expectedrows : int
            The expected number of rows in the HDF5 table.

        min_itemsize : int
            The minimum string size.

        table_name : str
            The name of the HDF5 table.
    """

    if os.path.isfile(hdf_path):
        print('HDF5 Table already exists. No changes were made.')
        return

    store = pd.HDFStore(hdf_path, complevel=9, complib='zlib', fletcher32=True)

    for i in np.arange(no_files):
        csv_file = csv_path.format(i)

        # the first part carries the header row; later parts reuse the column names
        if i == 0:
            data = pd.read_csv(csv_file)
        else:
            data = pd.read_csv(csv_file, header=None, names=data_cols)

        store.append(table_name, data, index=False, expectedrows=expectedrows,
                     min_itemsize=min_itemsize)
        del data
        gc.collect()

    store.close()
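For illustration only, a hedged example of how csv_to_hdf could be invoked; the file names and column names below are hypothetical and merely show the expected '{}' placeholder in csv_path:

    from mclearn.preprocessing import csv_to_hdf

    # hypothetical parts: sdss_part_0.csv, sdss_part_1.csv, ..., sdss_part_4.csv
    csv_to_hdf('sdss_part_{}.csv', no_files=5, hdf_path='sdss.h5',
               data_cols=['ra', 'dec', 'class'],   # placeholder column names
               expectedrows=1000000, table_name='sdss')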