From 54cc245733638a1f7e512d2580c7859061c17404 Mon Sep 17 00:00:00 2001
From: arianaolson419
Date: Tue, 15 Mar 2016 20:10:22 -0400
Subject: [PATCH] submitting this toolbox

---
 learning_curve.py | 27 ++++++++++++++++++++++++---
 questions.txt     |  7 +++++++
 2 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 questions.txt

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..2fa92bf 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,8 +7,8 @@ from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
-num_trials = 10
+# print data.DESCR
+num_trials = 20
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -17,10 +17,31 @@
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+
+def train_test(percent):
+    '''Partition the data into training and testing sets, train a logistic
+    regression model on the training set, and return its accuracy on the testing set.
+    percent: percentage of the data used for training'''
+    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=percent/100.0)
+    model = LogisticRegression(C=10**0)
+    model.fit(X_train, y_train)
+    return model.score(X_test, y_test)
+
+for i, percent in enumerate(train_percentages):
+    t = 0
+    for j in range(num_trials):
+        t += train_test(percent)/num_trials  # accumulate the average accuracy for this percentage
+    test_accuracies[i] = t
+
 fig = plt.figure()
 # for i in range(10):
 #     subplot = fig.add_subplot(5,2,i+1)
 #     subplot.matshow(numpy.reshape(data.data[i], \
 #                     (8,8)), cmap='gray')
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
+plt.title(str(num_trials))  # label the plot with the number of trials averaged
 plt.show()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..42d4ae0
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,7 @@
+1. The accuracy increases as the percentage of the data partitioned for training increases.
+
+2. The ends of the curve appear to be slightly less noisy than the middle. This may be because accuracy is consistently high when the amount of training data is very large, and there is little room for variation because the testing set is so small. At the low end, the reverse is true: there is so little training data and so much testing data that the tests are very unlikely to yield a high accuracy. In the middle, there is more room for variation in accuracy.
+
+3. The curve seems to be much smoother after about 25 trials.
+
+4. The trend of the curve becomes increasingly logarithmic as C becomes larger.
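
Below is a minimal sketch, not part of the submitted patch, of the experiment behind question 4: rerunning the same learning-curve loop for several values of the regularization parameter C. It assumes a recent scikit-learn where train_test_split lives in sklearn.model_selection; the C values, the max_iter setting, and the helper name average_accuracy are illustrative choices rather than anything taken from the patch.

# Sketch only: repeat the learning-curve experiment for several values of C
# (assumes scikit-learn >= 0.18, where train_test_split moved to model_selection).
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 20

def average_accuracy(percent, C):
    # Average test accuracy over num_trials random splits at one training percentage.
    scores = []
    for _ in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=percent / 100.0)
        model = LogisticRegression(C=C, max_iter=1000)  # max_iter raised to avoid convergence warnings
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    return np.mean(scores)

# Compare a very strong, a moderate, and a very weak regularizer (illustrative values).
for C in (10**-10, 10**0, 10**10):
    curve = [average_accuracy(p, C) for p in train_percentages]
    print('C = %g:' % C, ' '.join('%.3f' % a for a in curve))

Printing (or plotting) one curve per C makes the shape change described in question 4 easy to inspect side by side.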