diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..e4bfee5 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -6,11 +6,24 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
+
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
+
+num_trials = 100
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = []
+for i in train_percentages:
+    avg_test_accuracy = 0
+    for j in range(0, num_trials):
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=i / 100.0)
+        model = LogisticRegression(C=10**-3)
+        model.fit(X_train, y_train)
+        avg_test_accuracy += model.score(X_test, y_test)
+    avg_test_accuracy /= num_trials
+    print i
+    print "Test accuracy %f"%avg_test_accuracy
+    test_accuracies.append(avg_test_accuracy)
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
@@ -18,7 +31,6 @@
 
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 # TODO: your code here
-
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..2eb6b71
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1. The general trend in the curve is upward, with some diminishing returns at higher percentages of data used for training.
+2. Lower percentages of data used for training tend to produce more noise. This makes sense: with less training data, the model's accuracy fluctuates more from one random split to the next.
+3. 1000 trials produces a decently smooth curve.
+4. As C increases, the accuracy values all increase because the regularization strength decreases (C is the inverse of regularization strength), so the model can fit the data more closely. The graph also acquires a more curved shape, suggesting that the rate of diminishing returns increases as C increases. In other words, the first few percentage points of additional training data matter less for low values of C than they do for high values of C.
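
For reference, below is a minimal, self-contained sketch of the experiment behind the answers in questions.txt. It is an illustration, not part of the submission: the run_learning_curve helper and the particular C values swept here are made up for demonstration, and it assumes a modern scikit-learn where train_test_split lives in sklearn.model_selection rather than the deprecated sklearn.cross_validation module used in the diff above.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older versions

data = load_digits()
train_percentages = range(5, 95, 5)

def run_learning_curve(C, num_trials):
    # For each training-set size, average the test accuracy over
    # num_trials independent random train/test splits.
    accuracies = []
    for pct in train_percentages:
        total = 0.0
        for _ in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            total += model.score(X_test, y_test)
        accuracies.append(total / num_trials)
    return accuracies

# Question 3: a larger num_trials averages away more split-to-split noise.
# Question 4: a larger C means weaker regularization, so accuracies rise.
for C in (10**-10, 10**-3, 1.0):
    curve = run_learning_curve(C, num_trials=10)
    print("C=%g: %s" % (C, " ".join("%.3f" % a for a in curve)))

Raising num_trials in the call smooths the resulting curve, which is what the answer to question 3 refers to, while sweeping C reproduces the upward shift and increased curvature described in the answer to question 4.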