Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# TextMining
This is the base repo for the text mining and analysis project for Software Design, Spring 2016 at Olin College.
This is a color frequency analysis tool. In order to use the tool, simply run run_books.py. This will automatically
analyze the text of Grimm's fairy tales. If you would like to change the colors searched for in these tales, open
text_filter.py and change the strings in COLORS (a global variable) to whatever you would like to be searched.
After running the script, follow the prompts for the name of the story (letters and
underscores only), the URL of the story's text file, the save file name (letters and underscores only), and the graph title.
Empty file added __init__.py
Empty file.
Binary file added gettys_writeup_reflection_textmining.pdf
Binary file not shown.
16 changes: 16 additions & 0 deletions pickling_import_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pattern.web import *
import urllib2
import pickle



def get_text(URL_string, name):
    """Download a text file from the web and pickle it for future use.

    Arguments:
        URL_string: URL of the text file to download, as a string.
        name: base name of the pickle file, as a string; '.pickle' is appended.
    Returns: nothing. The downloaded text is written to <name>.pickle on disk.
    """
    tale = URL(URL_string).download()
    # Binary mode is what the pickle documentation asks for, and the with-block
    # closes the file even when pickle.dump raises.
    with open(name + '.pickle', 'wb') as save_file:
        pickle.dump(tale, save_file)

43 changes: 43 additions & 0 deletions run_books.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pickling_import_text import *
from text_filter import *
import sys


#def run_func(name_string = 'grimm', url_string='http://www.gutenberg.org/cache/epub/2591/pg2591.txt', graph_title_string='Color Word Frequencies in Brothers Grimm Stories', save_image_string = 'grimm_chart.png'):
def run_func(name_string, url_string, graph_title_string, save_image_string):
    """Run the whole analysis: download, pickle, count color words, and graph.

    Arguments:
        name_string: base name used for the pickle file of the downloaded text.
        url_string: URL of the text file to analyse.
        graph_title_string: title drawn on the generated graph.
        save_image_string: file name the graph is saved to.
    """
    # Download the text and store it on disk, then read it back from the pickle.
    get_text(url_string, name_string)
    words = tale_slicing(text_importing(name_string))

    # Count the color words and split the (color, count) pairs into two lists.
    color_counts = color_searching(words)
    colors_and_frequencies = list_dumping(color_counts.items())

    universal_graph_func(colors_and_frequencies, graph_title_string, save_image_string)



# Only prompt and run the analysis when this file is executed as a script, so
# that importing the module does not block on user input or touch the network.
if __name__ == '__main__':
    user_input_name = raw_input("Please input a Name of Story (letters and underscores only)")
    user_input_url = raw_input("Please input a URL to a text file to be analysed")
    user_input_title = raw_input("Please input what your desired title for the graph that will be generated")
    # The graph is always saved as a PNG; the extension is appended for the user.
    user_input_save = raw_input("Please input the file name you would like the graph to be saved to (letters and underscores only)") + '.png'

    run_func(user_input_name, user_input_url, user_input_title, user_input_save)



#if len(sys.argv) ==0:
# run_func()
#elif len(sys.argv) ==4:
# run_func(name_string=str(sys.argv[0]) , url_string=str(sys.argv[1]),
# graph_title_string=str(sys.argv[2]), save_image_string= str(sys.argv[3]))

#else:
# print "Incorrect Number of Arguements. Please run without arguements for Grimm anaylsis or follow the following format: name url graph_title save_image_name)





#run_func('grimm', 'http://www.gutenberg.org/cache/epub/2591/pg2591.txt','Color Word Frequencies in Brothers Grimm Stories', 'grimm_chart_2.png' )
#run_func('perrault','https://ia600302.us.archive.org/15/items/thefairytalesofc29021gut/pg29021.txt', 'Color Word Frequencies in Charles Perrault Stories','perrault_chart_2.png', "perrault_chart_2.png")
#run_func('andersen','https://archive.org/download/fairytalesofhans27200gut/27200.txt', 'Color Word Frequencies in Hans Christian Andersen Stories', 'andersen_chart_2.png')

114 changes: 114 additions & 0 deletions text_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
""" This code was written by Rebecca Gettys, except where otherwise noted. """
#### MAIN SECTION ####
import pickle
import string
import seaborn as sns

COLORS = ['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'grey', 'black', 'white', 'pink', 'ivory', 'tan', 'silver', 'gold', 'rose','gray', 'olive', 'crimson', 'maroon',
'fuchsia', 'teal', 'lavender', 'lilac', 'aqua', 'azure', 'beige', 'indigo', 'magenta', 'cyan', 'scarlet',
'canary', 'periwinkle']




def text_importing(name):
    """Load previously pickled fairy tale data from disk.

    Arguments: name of the pickle file of previously pickled data (as a string,
        without the .pickle ending).
    Returns: the object stored in the pickle file (the pickled tale text).
    """
    # NOTE(security): pickle.load can execute arbitrary code from a crafted
    # file, so only load pickle files that this project wrote itself.
    # Binary mode is what the pickle documentation asks for; on platforms where
    # text and binary mode differ, the file must have been written in binary
    # mode as well. The with-block closes the file, which was previously leaked.
    with open(name + '.pickle', 'rb') as input_file:
        return pickle.load(input_file)




def color_searching(tale):
    """Count how many times each color word in COLORS occurs in the tale.

    Arguments: tale, an iterable of words (in this context a list of strings).
    Returns: dictionary with one key per entry of COLORS, mapped to the number
        of items in tale that are equal to that color word (0 when absent).
    Matching is exact, so the words must already be split and stripped of
    punctuation for this to work.
    """
    tally = {name: 0 for name in COLORS}
    for token in tale:
        if token not in tally:
            continue  # not a color word we are tracking
        tally[token] += 1
    return tally





def tale_slicing(tale):
    """Slice a tale up into a list of words without spaces or punctuation.

    NOTE: https://mail.python.org/pipermail/tutor/2001-October/009454.html
    explains the punctuation removal method used here.

    Arguments: tale, the text to split, as a single string.
    Returns: list of words. Every character found in string.punctuation is
        removed first (so "It's" becomes "Its"), then the text is split on
        whitespace. An empty or punctuation-only tale gives an empty list.
    """
    # Collect the kept characters and join them once, rather than growing a
    # string one character at a time, which can be quadratic on large texts.
    tale_no_punc = ''.join(
        char for char in tale if char not in string.punctuation
    )
    return tale_no_punc.split()





def is_punct_char(char):
    """Tell whether a character is punctuation (helper adapted from python.org).

    Arguments: char, the character to check.
    Returns: True if char occurs in string.punctuation, False otherwise.
    """
    punctuation_marks = string.punctuation
    return char in punctuation_marks




def list_dumping(list):
    """Split a sequence of pairs into two parallel lists.

    Based on http://stackoverflow.com/questions/7558908/unpacking-a-list-tuple-of-pairs-into-two-lists-tuples;
    used to turn the output of dict.items() into two lists in matching order.

    Arguments: list, an iterable of items whose index 0 is a key and whose
        index 1 is a value (e.g. two-item tuples).
    Returns: a two-element list [keys, values], both in the order the pairs
        were given.
    """
    # The parameter name shadows the builtin but is kept, as it is part of the
    # function's signature.
    keys = []
    values = []
    for pair in list:
        keys.append(pair[0])
        values.append(pair[1])
    return [keys, values]

### END OF MAIN SECTION ###

### GRAPHING AND DATA PROCESSING ###



## patrick is amazing for helping with this!!



def universal_graph_func(text_variable, title_string, save_file_name_string):
    """Draw a bar chart of color word frequencies and save it to a file.

    Arguments:
        text_variable: two-element sequence; index 0 holds the color names and
            index 1 the matching occurrence counts.
        title_string: title placed on the chart.
        save_file_name_string: file name the figure is saved to.
    Returns: nothing. The figure is saved and then cleared.
    """
    sns.set(font_scale=.8)
    sns.axlabel('Color', 'Frequency')

    # colors from http://www.color-hex.com and wikipedia
    # NOTE(review): the palette is handed to barplot as a plain list, so it is
    # presumably matched to the bars by position and relies on the order of
    # text_variable[0] -- confirm before reordering either one.
    hex_codes = [
        "#4b0082", "#ffd700", "#e6e6fa", "#ffff00", "#FF2400", "#ff6eb4",
        "#d2b48c", "#ff00ff", "#0000ff", "#C8A2C8", "#800080", "#FF007F",
        "#FD3F92", "#000000", "#dc143c", "#CCCCFF", "#ffffff", "#ff0000",
        "#631919", "#fffff0", "#ffa500", "#730000", "#808000", "#00ffff",
        "#c0c0c0", "#808080", "#7fffd4", "#808080", "#008000", "#f5f5dc",
        "#329999", "#f0ffff", "#FFEF00",
    ]
    bar_palette = sns.color_palette(hex_codes)

    color_names = text_variable[0]
    counts = text_variable[1]
    axes = sns.barplot(color_names, counts, palette=bar_palette)
    figure = axes.get_figure()

    # Tilt the x tick labels by 45 degrees.
    for tick_label in axes.get_xticklabels():
        tick_label.set_rotation(45)

    sns.plt.title(title_string)
    figure.savefig(save_file_name_string)
    figure.clf()