[sourcecode language="python" wraplines="true" collapse="false" firstline="1"]
# The dataset is first loaded, # the string values converted to numeric and the output column is converted from strings to the integer values of 0 and 1. # This is achieved with helper functions load_csv(), # str_column_to_float() and str_column_to_int() to load and prepare the dataset. # I will use k-fold cross validation to estimate the performance of the learned model on unseen data. # This means that I will construct and evaluate k models and estimate the performance as the mean model error. # Classification accuracy will be used to evaluate each model. # These behaviors are provided in the cross_validation_split(), accuracy_metric() and evaluate_algorithm() helper functions. # I will also use an implementation of the Classification and Regression Trees (CART) algorithm # adapted for bagging including the helper functions test_split() to split a dataset into groups, # gini_index() to evaluate a split point, # our modified get_split() function discussed in the previous step, # to_terminal(), split() and build_tree() used to create a single decision tree, # predict() to make a prediction with a decision tree, # subsample() to make a subsample of the training dataset and bagging_predict() to make a prediction with a list of decision trees. # A new function name random_forest() is developed that first creates a list of decision trees # from subsamples of the training dataset and then uses them to make predictions. 
# Select the best split point for a dataset def get_split(dataset, n_features): class_values = list(set(row[-1] for row in dataset)) b_index, b_value, b_score, b_groups = 999, 999, 999, None features = list() while len(features) < n_features: index = randrange(len(dataset[0])-1) if index not in features: features.append(index) for index in features: for row in dataset: groups = test_split(index, row[index], dataset) gini = gini_index(groups, class_values) if gini < b_score: b_index, b_value, b_score, b_groups = index, row[index], gini, groups return {'index':b_index, 'value':b_value, 'groups':b_groups} # Random Forest Algorithm on **** Dataset from random import seed from random import randrange from csv import reader from math import sqrt # Load a CSV file def load_csv(filename): dataset = list() with open(filename, 'r') as file: csv_reader = reader(file) for row in csv_reader: if not row: continue dataset.append(row) return dataset # Convert string column to float def str_column_to_float(dataset, column): for row in dataset: row = float(row.strip()) # Convert string column to integer def str_column_to_int(dataset, column): class_values = for row in dataset] unique = set(class_values) lookup = dict() for i, value in enumerate(unique): lookup[value] = i for row in dataset: row = lookup] return lookup # Split a dataset into k folds def cross_validation_split(dataset, n_folds): dataset_split = list() dataset_copy = list(dataset) fold_size = int(len(dataset) / n_folds) for i in range(n_folds): fold = list() while len(fold) < fold_size: index = randrange(len(dataset_copy)) fold.append(dataset_copy.pop(index)) dataset_split.append(fold) return dataset_split # Calculate accuracy percentage def accuracy_metric(actual, predicted): correct = 0 for i in range(len(actual)): if actual[i] == predicted[i]: correct += 1 return correct / float(len(actual)) * 100.0 # Evaluate an algorithm using a cross validation split def evaluate_algorithm(dataset, algorithm, n_folds, *args): 
folds = cross_validation_split(dataset, n_folds) scores = list() for fold in folds: train_set = list(folds) train_set.remove(fold) train_set = sum(train_set, []) test_set = list() for row in fold: row_copy = list(row) test_set.append(row_copy) row_copy[-1] = None predicted = algorithm(train_set, test_set, *args) actual = for row in fold] accuracy = accuracy_metric(actual, predicted) scores.append(accuracy) return scores # Split a dataset based on an attribute and an attribute value def test_split(index, value, dataset): left, right = list(), list() for row in dataset: if row[index] < value: left.append(row) else: right.append(row) return left, right # Calculate the Gini index for a split dataset def gini_index(groups, classes): # count all samples at split point n_instances = float(sum([len(group) for group in groups])) # sum weighted Gini index for each group gini = 0.0 for group in groups: size = float(len(group)) # avoid divide by zero if size == 0: continue score = 0.0 # score the group based on the score for each class for class_val in classes: p = for row in group].count(class_val) / size score += p * p # weight the group score by its relative size gini += (1.0 - score) * (size / n_instances) return gini # Select the best split point for a dataset def get_split(dataset, n_features): class_values = list(set(row[-1] for row in dataset)) b_index, b_value, b_score, b_groups = 999, 999, 999, None features = list() while len(features) < n_features: index = randrange(len(dataset[0])-1) if index not in features: features.append(index) for index in features: for row in dataset: groups = test_split(index, row[index], dataset) gini = gini_index(groups, class_values) if gini < b_score: b_index, b_value, b_score, b_groups = index, row[index], gini, groups return {'index':b_index, 'value':b_value, 'groups':b_groups} # Create a terminal node value def to_terminal(group): outcomes = for row in group] return max(set(outcomes), key=outcomes.count) # Create child splits for a 
node or make terminal def split(node, max_depth, min_size, n_features, depth): left, right = node['groups'] del(node['groups']) # check for a no split if not left or not right: node['left'] = node['right'] = to_terminal(left + right) return # check for max depth if depth >= max_depth: node['left'], node['right'] = to_terminal(left), to_terminal(right) return # process left child if len(left) <= min_size: node['left'] = to_terminal(left) else: node['left'] = get_split(left, n_features) split(node['left'], max_depth, min_size, n_features, depth+1) # process right child if len(right) <= min_size: node['right'] = to_terminal(right) else: node['right'] = get_split(right, n_features) split(node['right'], max_depth, min_size, n_features, depth+1) # Build a decision tree def build_tree(train, max_depth, min_size, n_features): root = get_split(train, n_features) split(root, max_depth, min_size, n_features, 1) return root # Make a prediction with a decision tree def predict(node, row): if row[node['index']] < node['value']: if isinstance(node['left'], dict): return predict(node['left'], row) else: return node['left'] else: if isinstance(node['right'], dict): return predict(node['right'], row) else: return node['right'] # Create a random subsample from the dataset with replacement def subsample(dataset, ratio): sample = list() n_sample = round(len(dataset) * ratio) while len(sample) < n_sample: index = randrange(len(dataset)) sample.append(dataset[index]) return sample # Make a prediction with a list of bagged trees def bagging_predict(trees, row): predictions = [predict(tree, row) for tree in trees] return max(set(predictions), key=predictions.count) # Random Forest Algorithm def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features): trees = list() for i in range(n_trees): sample = subsample(train, sample_size) tree = build_tree(sample, max_depth, min_size, n_features) trees.append(tree) predictions = [bagging_predict(trees, row) for row in test] 
return(predictions) # Test the random forest algorithm seed(2) # load and prepare data filename = '****.all.data.csv' dataset = load_csv(filename) # convert string attributes to integers for i in range(0, len(dataset[0])-1): str_column_to_float(dataset, i) # convert class column to integers str_column_to_int(dataset, len(dataset[0])-1) # evaluate algorithm n_folds = 5 max_depth = 10 min_size = 1 sample_size = 1.0 n_features = int(sqrt(len(dataset[0])-1)) for n_trees in [1, 5, 10]: scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features) print('Trees: %d' % n_trees) print('Scores: %s' % scores) print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))) [/sourcecode] To be continued]]>
In the previous part we talked about Training, Validation, and Testing dataset and how to define problem using your dataset. In that process we talked about couple of techniques, high level though, in order to identify what entails in your dataset. In this part we are discussing some statistical issues related to Classification Problem.
Two things affect the learning or the generalization capability: the choice of the algorithm (and its parameters) and number of training data. This ability to generalize can be estimated by various metrics including the prediction errors. The overall estimate of unseen error or risk of the model is given by:
Here, \(\sigma^2\) is the stochastic noise; \(\mathrm{var}\) is called the variance error and is a measure of how susceptible our hypothesis or the algorithm (G) is, if given different datasets; and \(\mathrm{bias}^2\) is called the bias error and represents how far away the best algorithm in the model (the average learner over all possible datasets) is from the optimal one.
Learning curves as shown in Figures below —where training and testing errors are plotted keeping either the algorithm with its parameters constant or the training data size constant—give an indication of underfitting or overfitting.
When the training data size is fixed, different algorithms or the same algorithms with different parameter choices can exhibit different learning curves. The Figure below shows two cases of algorithms on the same data size giving two different learning curves based on bias and variance.
The figure above shows the relationship between training data size and error rate when the model complexity is fixed; the different curves indicate different choices of models.
The algorithm or model choice also impacts model performance. A complex algorithm, with more parameters to tune, can result in overfitting, while a simple algorithm with less parameters might be underfitting. The classic figure to illustrate the model performance and complexity when the training data size is fixed is as follows:
Figure above shows the Model Complexity relationship with Error rate, over the training and the testing data when training data size is fixed.
Validation allows for exploring the parameter space to find the model that generalizes best. Regularization (to be discussed in linear models) and validation are two mechanisms that should be used for preventing overfitting. Sometimes the “k-fold cross-validation” process is used for validation, which involves creating k samples (folds) of the data, using k−1 of them to train on and the remaining one to test, repeated k times to give an average estimate. The following figure shows 5-fold cross-validation as an example:
The following are some commonly used techniques to perform data sampling, validation, and learning:
In this post we discussed the modelling risks we bear in machine learning. This may get compounded in multinomial, multi-dimensional, multi-label datasets. Multiclass classifications are similar to binary classifications, with the difference that there are several possible discrete outcomes instead of just two. What should be the prior distribution of the overall dataset if the total dataset inherently contains subsets? Can one distribution represent the overall nature of the dataset? These are valid questions a data scientist encounters in dealing with a multinomial dataset. How do you determine if there are significant outliers in the dataset, and can we associate those as attributes in our training, validation, and testing datasets? Can box plots and parallel coordinates plots help in determining the algorithm choice?
Exploratory studies for the multinomial data have revealed a very interesting problem. In particular, the box plots, coupled with the parallel coordinates plot, suggest that a good choice of algorithm might be an ensemble method if there’s enough data to fit it. The sets of attributes corresponding to one class or another apparently have a complicated boundary between them. What algorithm will give the best predictive performance remains to be seen in machine learning. The exploratory methods you may have seen have done their job. They have given a good understanding of the tradeoffs for this problem, leading to some guesses about what algorithm will give the best performance.
In our next post we will discuss the priori dirichlet distribution to see how convenient it can be in multiclass, multinomial dataset.
Continued…
]]>In the first part we discussed some bits of Classification Problem. In this post we will continue with classification problem with more inputs.
In our first post we left the post at the topic ML application to numeric attributes. But what about categorical attributes? You want to check to see how many categories they have and how many examples there are from each category. You want to check these things for a couple of reasons. The gender attribute has two possible values (Male and Female), but if the attribute had been the state of the United States, there would have been 50 possible categories, or if it happens to be in India, 29 states. As the number of attributes grows, the complexity of dealing with them mounts. Most binary tree algorithms, which are the basis for ensemble methods, have a cutoff on how many categories they can handle. The popular Random Forests package written by Breiman and Cutler (the inventors of the algorithm) has a cutoff of 32 categories. If an attribute has more than 32 categories, you’ll need to aggregate them.
It is quite important to see that training involves taking a random subset of the data and training a series of models on it. Suppose, for instance, that the category is the state of the United States and that Ohio has only two examples. A random draw of training examples might not get any from Ohio. You need to see those kinds of problems before they occur so that you can address them. In the case of the two Ohio examples, you might merge them with Pennsylvania or Indiana, you might duplicate them, or you might manage the random draw so that you ensure getting Ohio examples (a procedure called stratified sampling).
Calculating the correlations and printing them or drawing cross‐plots works fine for a few correlations, but it is difficult to get a grasp of a large table of numbers, and it is difficult to squeeze all the cross‐plots onto a page if the problem has 100 attributes. One way to check correlations with a large number of attributes is to calculate the Pearson’s correlation coefficient for pairs of attributes, arrange those correlations into a matrix where the ij‐th entry is the correlation between the ith attribute and the jth attribute, and then plot them in a heat map. Different coloring can depict correlation strength appropriately. The light areas along the diagonal confirm that attributes close to one another in index have relatively high correlations. This is due to the way in which the data are generated: close data points are sampled at short time intervals from one another and consequently have similar frequencies. Similar frequencies reflect off the targets similarly (and so on).
The Holy Grail of creating good classification models is to train on a set of good quality, representative, (training data), tune the parameters and find effective models (validation data), and finally, estimate the model’s performance by its behavior on unseen data (test data). The central idea behind the logical grouping is to make sure models are validated or tested on data that has not been seen during training. Otherwise, a simple “rote learner” can outperform the algorithm. The generalization capability of the learning algorithm must be evaluated on a dataset which is different from the training dataset, but comes from the same population. The balance between removing too much data from training to increase the budget of validation and testing can result in models which suffer from “underfitting”, that is, not having enough examples to build patterns that can help in generalization. On the other hand, the extreme choice of allocating all the labeled data for training and not performing any validation or testing can lead to “overfitting”, that is, models that fit the examples too faithfully and do not generalize well enough.
Typically, in most machine learning challenges and real world customer problems, one is given a training set and testing set upfront for evaluating the performance of the models. In these engagements, the only question is how to validate and find the most effective parameters given the training set. In some engagements, only the labeled dataset is given and you need to consider the training, validation, and testing sets to make sure your models do not overfit or underfit the data.
Three logical processes are needed for modeling and hence three logical datasets are needed, namely,
The purpose of the training dataset is to give labeled data to the learning algorithm to build the models. The purpose of the validation set is to see the effect of the parameters of the training model being evaluated by training on the validation set. Finally, the best parameters or models are retrained on the combination of the training and validation sets to find an optimum model that is then tested on the blind test set.
In the next part we will cover how to generate Training, Validation, and Test data and how to use them, handle underfitting, overfitting etc.
Continued…
]]>Since machine learning is a complex and technically heavy subject, we shall discuss some of the aspects of machine learning, before we dwell into dataset issues and where it can affect your algorithms, coding patterns and outputs. As we know there are many ways we can design and develop codes around machine learning issues, in order to ascertain the types of learning approach, we put machine learning in various baskets, namely:
If your dataset is heavily focused on category labels, then you not just encounter a classification problem but also face dichotomy of dimension reduction. Here you need to decide on the spot, what you should expect as an output. Dimension reduction helps in identifying the insights, but whether those reduced dimensions make sense is completely a data scientist’s decision. Technical understanding of issues such as linearity/ non-linearity in dataset, other characteristics such as ability to create a canonical variable using existing two or more variables from the dataset , determine how you should target the dataset and devise your machine learning algorithms.
In Supervised Learning, you can assign tasks of inferring various functions from labeled training data. The training data can consist of a set of training examples as a case of some benchmark, as a first step to devise your model. In supervised learning, each training example is a pair consisting of an input object/labeled category (typically a vector) and a desired output value serving as the supervisory signal — a sort of mapping function — from which your algorithm can determine how far off your output is from the supervisory signals.
We shall be discussing more on Supervised Learning and touch the topics of Blending, GBRT, RBFN and other Neural Network approaches, and Regression topics such as LARS and LASSO, etc., in coming posts.
In Unsupervised Learning, the goal is to model the underlying structure or distribution in the data in order to learn more about the data, unlike supervised learning. These are called unsupervised learning because, unlike supervised learning above, there are no correct answers and there is no mapping function or benchmark available to determine the desired output. Algorithms are left to their own devices to discover and present the interesting structure in the data. Unsupervised learning problems can be further grouped into clustering and association problems.
Here we also encounter issues like classification vs. predictive models. Which one to use is determined by the dataset and the client’s business profile. Some popular examples of unsupervised learning algorithms can be highlighted as k-means for clustering problems and apriori algorithm for association rule learning problems. Detailed discussion will be published in later posts.
In Semi-Supervised Learning, the approach is that of trying to find hidden structure in unlabeled data with some predetermined benchmarks. Since the examples given to the learner are unlabeled, there is no error or reward signal to evaluate a potential solution, though skewness can be traced using pre-supplied benchmarks, the quality or usabilty of training examples sets is determined by the data scientist, thus it is a subjective decision. As we can see, these problems sit in between both supervised and unsupervised learning.
A good example is a photo archive where only some of the images are labeled, (e.g. dog, cat, person) and the majority are unlabeled. Many real world machine learning problems fall into this area. This is because it can be expensive or time-consuming to label data as it may require access to domain experts. Whereas unlabeled data is cheap and easy to collect and store. You can use unsupervised learning techniques to discover and learn the structure in the input variables. You can also use supervised learning techniques to make best guess predictions for the unlabeled data, feed that data back into the supervised learning algorithm as training data and use the model to make predictions on new unseen data.
Continued…
]]>Let’s understand through a simple example to review some basic problem structure, nomenclature, and characteristics of a machine learning data set. First we need to ascertain & introduce some language to discuss different types of function approximation problems one by one. These problems illustrate common variations of machine learning problems so that you’ll know how to recognize the variants when you see them and will know how to handle them (and will have code examples for them)
Attributes and labels go by a variety of names, and new machine learners can get tripped up by the name switching from one algo-author to another or even one section to another from a single author. Attributes (the variables being used to make predictions) are also known as the following:
One type of attribute is called a categorical or factor variable. Categorical variables have the property that there’s no order relation between the various values. Categorical variables can be two‐valued, like Boy or Girl, or multivalued, like states (MP, MH, AP . . . MIZ). Other distinctions can be drawn regarding attributes (integer versus float, for example), but they do not have the same impact on machine learning algorithms.
The reason for this is that many machine learning algorithms take numeric attributes only; they cannot handle categorical or factor variables. Penalized regression algorithms deal only with numeric attributes. The same is true for support vector machines, kernel methods, and K‐nearest neighbors. There are some methods for converting categorical variables to numeric variables. The nature of the variables will shape your algorithm choices and the direction you take in developing a predictive model, so it’s one of the things you need to pay attention to when you face a new problem.
A similar dichotomy arises for the labels. When the labels are numeric, the problem is called a regression problem. When the labels are categorical, the problem is called a classification problem. If the categorical target takes only two values, the problem is called a binary classification problem. If it takes more than two values, the problem is called a multi-class classification problem. In many cases, the choice of problem type is up to the algo-designer. A problem can be converted from a regression problem to a binary classification problem by a simple transformation of the labels. These are trade-offs that you might want to make as part of your attack on a problem.
You’ll want to ascertain a number of other features of the data set as part of your initial inspection of the data. The following is a checklist and a sequence of things to learn about your data set to familiarize yourself with the data and to formulate the predictive model development steps that you want to follow. These are simple things to check and directly impact your next steps. In addition, the process gets you moving around the data and learning its properties.
One of the first things to check is the size and shape of the data. Read the data into a list of lists; then the dimension of the outer list is the number of rows, and the dimension of one of the inner lists is the number of columns. The next section shows the concrete application of this to one of the data sets that you’ll see used later to illustrate the properties of an algorithm that will be developed.
The next step in the process is to determine how many missing values there are in each row. The reason for doing it on a row‐by‐row basis is that the simplest way to deal with missing values is to throw away instances that aren’t complete (examples with at least one missing value). In many situations, this can bias the results, but just a few incomplete examples will not make a material difference. By counting the rows with missing data (in addition to the total number of missing entries), you’ll know how much of the data set you have to discard if you use the easy method.
If you have a large number of rows, as you might if you’re collecting web data, the number you’ll lose may be small compared to the number of rows of data you have available. If you’re working on biological problems where the data are expensive and you have many attributes, you might not be able to afford to throw data out. In that case, you’ll have to figure out some ways to fill in the missing values or use an algorithm that can deal with them.
Filling them in is called imputation. The easiest way to impute the missing data is to fill in the missing entries using average values of the entries in each column, or in the worst case zero (this is debatable though). A more sophisticated method is to use one of the predictive methods that will be covered in our next post on penalized regression &amp; ensemble techniques. To use a predictive method, you treat a column of attributes with missing values as though it were labels. Be sure to remove the original problem labels before undertaking this process.
The next several sections we are going to go through the process to be outlined here and will introduce some methods for characterizing your data set to help you decide how to attack the modeling process.
It starts with simple measurements of size and shape, reporting data types, counting missing values, and so forth. Then it moves on to statistical properties of the data and interrelationships between attributes and between attributes and the labels
What difference does this make? The number of rows and columns has several impacts on how you proceed. First, the overall size gives you a rough idea of how long your training times are going to be. For a small data set training time will be less than a minute, which will facilitate iterating through the process of training and tweaking.
If the data set grows to 1,000 x 1,000, the training times will grow to a fraction of a minute for penalized linear regression and a few minutes for an ensemble method. As the data set gets to several tens of thousands of rows and columns, the training times will expand to 3 or 4 hours for penalized linear regression and 12 to 24 hours for an ensemble method. The larger training times will have an impact on your development time because you’ll iterate a number of times.
The second important observation regarding row and column counts is that if the data set has many more columns than rows, you may be more likely to get the best prediction with penalized linear regression and vice versa.
At this point it is important that you check how many of the columns of data are numeric versus categorical
After determining which attributes are categorical and which are numeric, you’ll want some descriptive statistics for the numeric variables and a count of the unique categories in each categorical attribute. The first step is to calculate the mean and standard deviation for the chosen attribute. Knowing these will undergird your intuition as you’re developing models.
The next step is to look at outliers. One way to reveal this sort of mismatch is to divide a set of numbers into percentiles. The easiest way to visualize forming these groupings is to imagine that the data are sorted into numeric order. The numbers in the preceding list are arranged in numeric order. That makes it easy to see where the percentile boundaries go. Some often used percentiles are given special names. The percentiles defined by dividing the set into equal quarters, fifths, and tenths are called respectively quartiles, quintiles, and deciles.
One way to study outliers in more detail is to plot the distribution of the data in question relative to some reasonable distributions to see whether the relative numbers match up. If the data being analyzed comes from a Gaussian distribution, the point being plotted will lie on a straight line
Outliers may cause trouble either for model building or prediction. After you’ve trained a model on this data set, you can look at the errors your model makes and see whether the errors are correlated with these outliers. If they are, you can then take steps to correct them. One way you can replicate the poor‐performing examples to force them to be more heavily represented. You can segregate them out and train on them as a separate class. You can also edit them out of the data if they represent an abnormality that won’t be present in the data your model will see when deployed. A reasonable process for this might be to generate quartile boundaries during the exploration phase and note potential outliers to get a feel for how much of a problem you might (or might not) have with it. Then when you’re evaluating performance data, use quantile‐quantile (Q‐Q) plots to determine which points to call outliers for use in your error analysis.
Continued…
]]>
Spatial Models |
Simulations |
Time Series |
Churn Analysis |
Survival Analysis |
Inventory management |
Market Segmentation (Customer / Products / Services) |
Optimum Bidding |
Recommendation Systems |
Optimum Pricing |
Association Rule Learning |
Indexation |
Attribution Modeling |
Search Engines |
Scoring |
Cross-Selling |
Predictive Modeling |
Clinical trials |
Clustering |
Multivariate Testing |
Supervised Classification |
Queuing Systems |
Extreme Value Measurement |
Supply Chain Optimization |
]]>DataIntel’s expertise in this working layer is about designing algorithms to extract insights from rather large and potentially unstructured data (text mining), sometimes called nugget discovery — for instance, unearthing a massive botnet after looking at 50 million rows of data. DataIntel’s techniques include pattern recognition, feature selection, clustering, and monitored classification, and encompass a few statistical techniques (though without the p-values or confidence intervals attached to most statistical methods being used). Instead, emphasis is on robust, data-driven, scalable techniques, without much interest in discovering causes or interpretability. Data mining thus has some intersection with statistics, and it is a subset of data science.
]]>Exploratory analytics looks to discover signals and nonobvious relationships or patterns among broader data sets to uncover new insights that can positively impact decision making and organizational performance. Once discovered and validated, newly discovered signals are operationalized to improve operational intelligence and monitored for ongoing quality using performance management processes. Discovery can be visual, collaborative, and algorithmic.