sd16spring · rebeccagettys · Feb 25, 2016 · Apr 17, 2016 · Apr 17, 2016
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -1,2 +1,5 @@
-# TextMining
-This is the base repo for the text mining and analysis project for Software Design, Spring 2016 at Olin College.
+This is a color frequency analysis tool.  In order to use the tool, simply run run_books.py. This will automatically
+analyze the text of Grimm's fairy tales. If you would like to change the colors searched for in these tales, open
+text_filter.py and change the strings in COLORS (a global variable) to whatever you would like to be searched.
+After starting the script, follow the prompts for the name of the story (letters and
+ underscores only), the URL of the story's text file, the save file name (letters and underscores only), and the graph title.
diff --git a/__init__.py b/__init__.py
diff --git a/gettys_writeup_reflection_textmining.pdf b/gettys_writeup_reflection_textmining.pdf
diff --git a/pickling_import_text.py b/pickling_import_text.py
@@ -0,0 +1,16 @@
+from pattern.web import *
+import urllib2
+import pickle
+
+
+
+def get_text(URL_string, name):
+    """Download a text file from the web and pickle its contents to disk for future use.
+Arguments: URL of text file to be saved as a string, name of the file for use in naming the pickle file (also a string);
+ the data is written to name + '.pickle'.
+ Returns: nothing; the result is the pickle file written to disk."""
+
+    tale = URL(URL_string).download()
+    # The with-block closes the file even if pickle.dump raises.
+    # The mode stays 'w' (text) so the file is written the same way text_importing reads it ('r').
+    with open(name + '.pickle', 'w') as save_file:
+        pickle.dump(tale, save_file)
+
diff --git a/run_books.py b/run_books.py
@@ -0,0 +1,43 @@
+from pickling_import_text import *
+from text_filter import *
+import sys
+
+
+#def run_func(name_string = 'grimm', url_string='http://www.gutenberg.org/cache/epub/2591/pg2591.txt', graph_title_string='Color Word Frequencies in Brothers Grimm Stories', save_image_string = 'grimm_chart.png'):
+def run_func(name_string, url_string, graph_title_string, save_image_string):
+    """Run the whole analysis: download a text, count its color words and save a bar chart.
+    Arguments: name_string - base name used for the pickle file; url_string - URL of the text file to fetch;
+               graph_title_string - title of the chart; save_image_string - file name the chart is saved to
+    Returns: nothing (a pickle file and an image file are written as side effects)"""
+    # Fetch the text and cache it on disk, then read the cached copy back and split it into words.
+    get_text(url_string, name_string)
+    words = tale_slicing(text_importing(name_string))
+    # Count the color words, then split the (color, count) pairs into two parallel lists for plotting.
+    color_counts = color_searching(words)
+    chart_data = list_dumping(color_counts.items())
+    universal_graph_func(chart_data, graph_title_string, save_image_string)
+
+
+
+# Collect the four run parameters interactively. These statements sit at module level,
+# so they execute whenever this file is run or imported.
+user_input_name = raw_input("Please input a Name of Story (letters and underscores only)")
+user_input_url = raw_input("Please input a URL to a text file to be analysed")
+user_input_title = raw_input("Please input what your desired title for the graph that will be generated")
+# '.png' is always appended, so the user should enter the file name without an extension.
+user_input_save = raw_input("Please input the file name you would like the graph to be saved to (letters and underscores only)") + '.png'
+
+# Download, analyse and chart the requested text.
+run_func(user_input_name,user_input_url, user_input_title, user_input_save)
+
+
+
+#if len(sys.argv) ==0:
+#    run_func()
+#elif len(sys.argv) ==4:
+#    run_func(name_string=str(sys.argv[0]) , url_string=str(sys.argv[1]),
+#             graph_title_string=str(sys.argv[2]), save_image_string= str(sys.argv[3]))
+
+#else:
+#    print "Incorrect Number of Arguements. Please run without arguements for Grimm anaylsis or follow the following format: name url graph_title save_image_name)
+
+
+
+
+
+#run_func('grimm', 'http://www.gutenberg.org/cache/epub/2591/pg2591.txt','Color Word Frequencies in Brothers Grimm Stories', 'grimm_chart_2.png' )
+#run_func('perrault','https://ia600302.us.archive.org/15/items/thefairytalesofc29021gut/pg29021.txt', 'Color Word Frequencies in Charles Perrault Stories','perrault_chart_2.png', "perrault_chart_2.png")
+#run_func('andersen','https://archive.org/download/fairytalesofhans27200gut/27200.txt', 'Color Word Frequencies in Hans Christian Andersen Stories', 'andersen_chart_2.png')
+
diff --git a/text_filter.py b/text_filter.py
@@ -0,0 +1,114 @@
+""" This code was written by Rebecca Gettys, except where otherwise noted.  """
+#### MAIN SECTION ####
+import pickle
+import string
+import seaborn as sns
+
+# Color words that color_searching counts; edit this list to search for different words.
+# 'grey' and 'gray' are separate entries, so each spelling gets its own count.
+# NOTE(review): universal_graph_func hard-codes a palette with the same number of entries (33) -
+# presumably the two are meant to stay in step; confirm before adding or removing colors.
+COLORS = ['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'grey', 'black', 'white', 'pink', 'ivory', 'tan', 'silver', 'gold', 'rose','gray', 'olive', 'crimson', 'maroon',
+    'fuchsia', 'teal', 'lavender', 'lilac', 'aqua', 'azure', 'beige', 'indigo', 'magenta', 'cyan', 'scarlet',
+    'canary', 'periwinkle']
+
+
+
+
+def text_importing(name):
+    """Imports previously-pickled fairy tale data from disk.
+    Arguments: name of the pickle file of previously pickled data (as a string, without the .pickle ending)
+    Returns: the unpickled object (the downloaded text when the file was written by get_text)"""
+    # NOTE: pickle.load can execute arbitrary code, so only load pickle files this program wrote itself.
+    # The with-block guarantees the file handle is closed, even if unpickling fails.
+    with open(name + '.pickle', 'r') as input_file:
+        return pickle.load(input_file)
+
+
+
+
+def color_searching(tale):
+    """Counts how many times each word in COLORS shows up in an iterable of words.
+    Arguments: tale - iterable of words (in this context the list produced by tale_slicing)
+    Returns: dictionary with one key per entry in COLORS and its number of occurrences (0 if it never shows up)
+    Matching is exact, so a capitalized word such as 'Red' is not counted.
+    Due to the non-orderedness of dictionaries, hard to use a doctest"""
+    color_dict = {color: 0 for color in COLORS}
+    for word in tale:
+        if word not in color_dict:
+            continue  # not one of the color words we are tracking
+        color_dict[word] += 1
+    return color_dict
+
+
+
+
+
+def tale_slicing(tale):
+    """Slices a tale (string) up into a list of words without spaces or punctuation
+     NOTE: https://mail.python.org/pipermail/tutor/2001-October/009454.html explains the punctuation removal method used
+    Arguments: tale - text of the tale as a single string
+    Returns: list of words (the punctuation-free text split on whitespace)
+    Punctuation is deleted rather than replaced by a space, so 'Snow-white' becomes 'Snowwhite'."""
+    # Build the punctuation-free text with a single join instead of growing a string one
+    # character at a time, which is quadratic for long texts.
+    tale_no_punc = ''.join(char for char in tale if not is_punct_char(char))
+    return tale_no_punc.split()
+
+
+
+
+
+def is_punct_char(char):
+    """From python.org (link in tale_slicing): tells whether a character is punctuation.
+    Arguments: character
+    Returns: True if the character is found in string.punctuation, False otherwise"""
+    punctuation_marks = string.punctuation
+    return char in punctuation_marks
+
+
+
+
+def list_dumping (list):
+    """Splits a sequence of pairs (such as the output of dict.items()) into two parallel lists.
+    Method found on http://stackoverflow.com/questions/7558908/unpacking-a-list-tuple-of-pairs-into-two-lists-tuples
+    Arguments: list (of two-item tuples) that needs to be separated into lists
+    Returns: [firsts, seconds] - the first items of every pair in one list and the second items in the other,
+     both in the order the pairs were given"""
+    firsts, seconds = [], []
+    for pair in list:
+        firsts.append(pair[0])
+        seconds.append(pair[1])
+    return [firsts, seconds]
+
+### END OF MAIN SECTION ###
+
+### GRAPHING AND DATA PROCESSING ###
+
+
+
+## patrick is amazing for helping with this!!
+
+
+
+def universal_graph_func(text_variable,title_string,save_file_name_string):
+    """Draws a bar chart of color-word frequencies with seaborn and saves it to a file.
+    Arguments: text_variable - two-item sequence: item 0 is the list of color names, item 1 the list of their counts
+               (the shape returned by list_dumping); title_string - title of the chart;
+               save_file_name_string - file name the figure is saved to
+    Returns: nothing (the chart is written to save_file_name_string)"""
+    sns.set(font_scale=.8)
+    # NOTE(review): called before the plot exists - presumably labels the current axes that barplot then draws on; confirm.
+    sns.axlabel('Color', 'Frequency')
+    # colors from http://www.color-hex.com and wikipedia
+    # 33 hex values, one per entry in COLORS.
+    # NOTE(review): the order presumably matches the order in which the colors arrive in text_variable[0]; verify.
+    flatui = ["#4b0082", "#ffd700", "#e6e6fa", "#ffff00", "#FF2400", "#ff6eb4", "#d2b48c", "#ff00ff", "#0000ff",
+              "#C8A2C8",
+              "#800080", "#FF007F", "#FD3F92", "#000000", "#dc143c", "#CCCCFF", "#ffffff", "#ff0000", "#631919",
+              "#fffff0",
+              "#ffa500", "#730000", "#808000", "#00ffff", "#c0c0c0", "#808080", "#7fffd4", "#808080", "#008000",
+              "#f5f5dc",
+              "#329999", "#f0ffff", "#FFEF00"]
+    custom_palette = sns.color_palette(flatui)
+    colors = text_variable[0]
+    occurences = text_variable[1]
+    ax = sns.barplot(colors, occurences, palette = custom_palette)
+    fig = ax.get_figure()
+    # Tilt the x tick labels (the color names) by 45 degrees.
+    for item in ax.get_xticklabels():
+        item.set_rotation(45)
+    sns.plt.title(title_string)
+    fig.savefig(save_file_name_string)
+    # Clear the figure after saving it.
+    fig.clf()
+
+
+