"""Compare word usage and sentiment between two mailing-list archives.

The script reads ``oldtherapy.txt`` and ``newtherapy.txt``, strips e-mail
header boilerplate and punctuation, ranks the remaining words by frequency
and prints the Pattern sentiment score for a 200-word window of each
ranking as well as for the words that appear in only one of the windows.

Third-party dependency: ``pattern.en`` (only needed when run as a script).
"""

import pickle  # NOTE(review): not referenced anywhere below; kept as-is.

OLD_ARCHIVE_PATH = 'oldtherapy.txt'
NEW_ARCHIVE_PATH = 'newtherapy.txt'

# Characters deleted outright from each archive before any replacement runs.
OLD_STRIP_CHARS = '>"[]()1*23%4;56&$789#0=@_?<\''
NEW_STRIP_CHARS = '>/"[]()1*23%456&$789#0,!=@+_?<\''

# Ranking windows: skip the top-ranked entries, then keep the next 200.
# NOTE(review): presumably the skipped entries are very common filler words;
# the offsets 23 and 20 are carried over unchanged -- confirm they are still
# right for the current data files.
OLD_RANK_WINDOW = slice(23, 223)
NEW_RANK_WINDOW = slice(20, 220)

# Ordered (needle, substitute) pairs.  ORDER MATTERS: earlier entries change
# what later entries can match (e.g. 'therapy' is removed before
# 'mailman therapy' is tried), so entries must not be sorted or merged.
# NOTE(review): entries whose substitute is '' join the neighbouring text
# together instead of leaving a gap; this is kept exactly as it was.
OLD_REPLACEMENTS = (
    ('\n', ' '),
    ('lists.olin.edu', ' '),
    ('Anonymous rants and raves', ' '),
    ('Original', ' '),
    ('Message', ' '),
    ('bounces', ' '),
    ('On Behalf Of', ' '),
    ('From', ' '),
    ('therapy', ' '),
    ('To', ' '),
    ('InReply', ' '),
    ('PM', ' '),
    ('GMT', ' '),
    ('http', ' '),
    ('!', ' '),
    ('+', ' '),
    ('/', ' '),
    ('-', ' '),
    ('.', ' '),
    (':', ' '),
    (',', ' '),
    ('Re', ' '),
    ('RE', ' '),
    ('Original message', ' '),
    ('Therapy', ' '),
    ('html', ' '),
    ('phpid', ' '),
    ('listinfo', ' '),
    ('mailman', ''),
    ('olin edu', ' '),
    ('\t', ' '),
    ('mailto', ' '),
    ('From Date To Subject', ' '),
    ('On Behalf Of Sent', ' '),
    ('To Subject', ' '),
    ('To therapy', ' '),
    ('Sent', ' '),
    ('\tmailman therapy', ' '),
    ('MessageID', ' '),
    ('mailman therapy', ' '),
    ('at Subject', ' '),
    ('ferences', ' '),
    ('Fromtherapy', ' '),
    ('attherapy', ' '),
    ('Subject', ' '),
    ('Date', ' '),
    ('November', ' '),
    ('December', ' '),
    ('January', ' '),
    ('February', ' '),
    ('March', ' '),
    ('April', ' '),
    ('May', ' '),
    ('June', ' '),
    ('July', ' '),
    ('August', ' '),
    ('September', ' '),
    ('October', ' '),
    ('Nov', ' '),
    ('Dec', ' '),
    ('Jan', ' '),
    ('Feb', ' '),
    ('Apr', ' '),
    ('May', ' '),
    ('Jun', ' '),
    ('Jul', ' '),
    ('Aug', ' '),
    ('Sep', ' '),
    ('Oct', ' '),
    ('ID', ' '),
    ('ply', ' '),
    ('On', ' '),
    ('AM', ' '),
    ('gmail', ' '),
    ('InplyIFID', ''),
    ('InplyD', ''),
    ('behalf', ''),
    (' e ', ' '),
    (' com ', ' '),
    (' text ', ' '),
    (' content ', ' '),
    (' FW ', ' '),
)

NEW_REPLACEMENTS = (
    ('\n', ' '),
    ('lists.olin.edu', ' '),
    ('Original', ' '),
    ('Message', ' '),
    ('therapy', ' '),
    ('PM', ' '),
    ('GMT', ' '),
    ('http', ' '),
    ('-', ' '),
    ('.', ' '),
    (':', ' '),
    ('InReply', ' '),
    ('Re', ' '),
    ('Original message', ' '),
    ('Therapy', ' '),
    ('heredownatyale', ' '),
    ('orgtheyaleproblem', ' '),
    ('beginsinhighschool', ' '),
    ('heterodoxacademy', ' '),
    ('phpid', ' '),
    ('bounces', ' '),
    ('From Date To Subject', ' '),
    ('On Behalf Of', ' '),
    ('To', ' '),
    ('Sent', ' '),
    ('\tmailman', ' '),
    ('ID', ' '),
    ('mailman', ' '),
    ('at Subject', ' '),
    ('ferences', ' '),
    ('Subject', ' '),
    ('Date', ' '),
    ('November', ' '),
    ('December', ' '),
    ('January', ' '),
    ('February', ' '),
    ('March', ' '),
    ('April', ' '),
    ('May', ' '),
    ('June', ' '),
    ('July', ' '),
    ('August', ' '),
    ('September', ' '),
    ('October', ' '),
    ('Nov', ' '),
    ('Apr', ' '),
    ('Dec', ' '),
    ('Feb', ' '),
    ('Jan', ' '),
    ('From', ' '),
    ('mailto', ' '),
    ('Behalf', ''),
    ('behalf', ''),
    ('Of', ''),
    ('AM', ''),
    ('ply', ''),
    ('Anonymous rants and raves', ''),
    (' Id ', ''),
    (' RF ', ''),
    (' Sniffer ', ''),
    (' E ', ''),
    (' CO ', ''),
    (' s ', ''),
    (' C ', ''),
)


def read_archive(path):
    """Return the full contents of the text file at *path*.

    The file is opened read-only and is closed even if reading fails.
    """
    with open(path, 'r') as handle:
        return handle.read()


def clean_words(text, strip_chars, replacements):
    """Strip boilerplate from *text* and return the remaining words.

    Args:
        text: Raw archive contents.
        strip_chars: Every character in this string is deleted from *text*.
        replacements: Ordered iterable of ``(needle, substitute)`` pairs,
            applied one after another with ``str.replace``.

    Returns:
        A list of whitespace-separated tokens (empty for empty input).
    """
    unwanted = set(strip_chars)
    # Works on both Python 2 and 3, unlike str.translate(None, chars).
    text = ''.join(ch for ch in text if ch not in unwanted)
    # Newlines and tabs are replaced as real characters by the tables; the
    # text is never passed through repr(), so no quote/paren/escape residue
    # ends up in the word list.
    for needle, substitute in replacements:
        text = text.replace(needle, substitute)
    # split() with no argument discards all runs of whitespace, so no
    # separate space-squeezing pass is needed.
    return text.split()


def histogram(s):
    """Return a dict mapping each item of *s* to its number of occurrences."""
    counts = dict()
    for item in s:
        counts[item] = counts.get(item, 0) + 1
    return counts


def most_frequent(s):
    """Return the distinct items of *s*, most frequent first.

    Items with equal frequency are ordered by descending item value, because
    the ranking sorts ``(frequency, item)`` pairs in reverse.
    """
    ranked = sorted(
        ((freq, item) for item, freq in histogram(s).items()),
        reverse=True,
    )
    return [item for _freq, item in ranked]


def only_in_first(first, second):
    """Return the items of *first* that are absent from *second*, in order."""
    excluded = set(second)
    return [item for item in first if item not in excluded]


def main():
    """Run the old-vs-new archive comparison and print the results."""
    # Imported here so the helpers above can be imported (and tested) on a
    # machine that does not have Pattern or the data files available.
    from pattern.en import sentiment

    old_therapy = clean_words(
        read_archive(OLD_ARCHIVE_PATH), OLD_STRIP_CHARS, OLD_REPLACEMENTS)
    new_therapy = clean_words(
        read_archive(NEW_ARCHIVE_PATH), NEW_STRIP_CHARS, NEW_REPLACEMENTS)

    old_words = most_frequent(old_therapy)[OLD_RANK_WINDOW]
    new_words = most_frequent(new_therapy)[NEW_RANK_WINDOW]

    unique_new = only_in_first(new_words, old_words)
    unique_old = only_in_first(old_words, new_words)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Sentiment for first 200 old therapy:' + str(sentiment(old_words)))
    print('Sentiment for first 200 new therapy:' + str(sentiment(new_words)))
    print(unique_old)
    print('Sentiment for unique old therapy:' + str(sentiment(unique_old)))
    print(unique_new)
    print('Sentiment for unique new therapy:' + str(sentiment(unique_new)))


if __name__ == '__main__':
    main()