"""Compare word usage and sentiment between two mailing-list archives.

The script reads ``oldtherapy.txt`` and ``newtherapy.txt``, strips e-mail
header noise from each, ranks the remaining words by frequency and prints the
``pattern.en`` sentiment of the most common words, together with the words
that appear in only one of the two rankings.

The code runs unchanged on Python 2 and Python 3.
"""

import pickle  # unused here; retained from the original script
from collections import Counter

# Substrings blanked out of the text, in the order they are applied.
# The order was found by trial and matters: earlier entries change what the
# later ones can match.
# NOTE(review): these are plain substring replacements, so short entries such
# as 'To', 'Re', 'On' or 'ply' are also removed from inside ordinary words.
# Entries like 'ferences' look like leftovers of such partial removals --
# confirm before relying on the word counts.
# The '\\n' / '\\t' entries match literal backslash escapes; they are kept so
# text that already contains such escapes is still cleaned.
badthings = ['\\n', 'lists.olin.edu', 'Anonymous rants and raves', 'Original',
             'Message', 'bounces', 'On Behalf Of', 'From', 'therapy', 'To',
             'InReply', 'PM', 'GMT', 'http', '!', '+', '/', '-', '.', ':',
             ',', 'Re', 'RE', 'message', 'Therapy', 'html', 'phpid',
             'listinfo', 'mailman', 'olin edu', '\\t', 'mailto',
             'From Date To Subject', 'To Subject', 'Sent',
             '\\tmailman therapy', 'MessageID', 'mailiman therapy',
             'at Subject', 'ferences', 'Fromtherapy', 'attherapy', 'Subject',
             'Date', 'January', 'February', 'March', 'April', 'May', 'June',
             'July', 'August', 'September', 'October', 'November',
             'December', 'Jan', 'Feb', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
             'Sep', 'Oct', 'Nov', 'Dec', 'ID', 'ply', 'On', 'AM', 'gmail',
             'InplyIFID', 'InplyD', 'Behalf Of',
             # Standalone junk tokens; the surrounding spaces keep them from
             # matching inside longer words.
             ' e ', ' com ', ' text ', ' s ', ' content ', ' FW ', ' RF ',
             ' Of ', ' Sniffer ', ' Behalf ', ' CO ', ' E ', ' C ']

# Input files and the single characters deleted from each before cleaning.
OLD_ARCHIVE = 'oldtherapy.txt'
NEW_ARCHIVE = 'newtherapy.txt'
OLD_STRIP_CHARS = '>"[]()1*23%4;56&$789#0=@_?<\''
NEW_STRIP_CHARS = '>/"[]()1*23%456&$789#0,!=@+_?<\''

# The top of each ranking is articles and similar filler; start at the first
# "real" words (offsets chosen by the original author per archive).
OLD_SKIP = 23
NEW_SKIP = 20
TOP_COUNT = 200


def clean_words(raw, strip_chars, bad=None):
    """Return the list of words left in *raw* after removing noise.

    raw         -- the text to clean
    strip_chars -- string of single characters deleted outright
    bad         -- substrings replaced by a space, applied in order
                   (defaults to the module-level ``badthings``)
    """
    if bad is None:
        bad = badthings
    unwanted = set(strip_chars)
    text = ''.join(ch for ch in raw if ch not in unwanted)
    # Turn newlines, tabs and runs of blanks into single spaces so that the
    # multi-word and space-delimited entries in *bad* match across line
    # breaks.  (The original stringified a tuple to get this effect, which
    # also injected quote and parenthesis characters into the text.)
    text = ' '.join(text.split())
    for junk in bad:
        text = text.replace(junk, ' ')
    return text.split()


def read_words(path, strip_chars):
    """Read the file at *path* and return its cleaned word list."""
    # Read-only access is enough; the with-block closes the file even if
    # reading fails.
    with open(path) as handle:
        raw = handle.read()
    return clean_words(raw, strip_chars)


def histogram(s):
    """Return a dict mapping each element of *s* to its number of occurrences."""
    return dict(Counter(s))


def most_frequent(s):
    """Return the distinct elements of *s*, most frequent first.

    Elements with equal counts are ordered from highest to lowest value,
    because the (count, element) pairs are sorted in reverse.
    """
    ranked = sorted(((freq, x) for x, freq in histogram(s).items()),
                    reverse=True)
    return [x for _, x in ranked]


def top_words(words, skip, count=TOP_COUNT):
    """Return *count* entries of the frequency ranking, after the first *skip*."""
    return most_frequent(words)[skip:skip + count]


def unique_to(words, others):
    """Return the items of *words* that are absent from *others*, in order."""
    seen = set(others)
    return [word for word in words if word not in seen]


def main():
    """Run the comparison and print the results."""
    # Imported here so the helpers above stay importable without pattern
    # installed; only ``sentiment`` is used from the package.
    from pattern.en import sentiment

    old_words = top_words(read_words(OLD_ARCHIVE, OLD_STRIP_CHARS), OLD_SKIP)
    new_words = top_words(read_words(NEW_ARCHIVE, NEW_STRIP_CHARS), NEW_SKIP)

    unique_new = unique_to(new_words, old_words)
    unique_old = unique_to(old_words, new_words)

    print('Sentiment for first 200 old therapy:' + str(sentiment(old_words)))
    print('Sentiment for first 200 new therapy:' + str(sentiment(new_words)))
    print(unique_old)
    print('Sentiment for unique old therapy:' + str(sentiment(unique_old)))
    print(unique_new)
    print('Sentiment for unique new therapy:' + str(sentiment(unique_new)))


if __name__ == '__main__':
    main()