Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Text_Mining_Project.pdf
Binary file not shown.
85 changes: 85 additions & 0 deletions revisedtherapyanal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pickle
from pattern.en import *
# Substrings stripped from the raw mailing-list text, listed in the order in
# which they are cleaned out.  The order was determined by trial and matters:
# once a short entry such as 'therapy' has been removed, later entries that
# contain it ('\\tmailman therapy', 'mailiman therapy', 'Fromtherapy',
# 'attherapy') can no longer match anything.
# NOTE(review): 'mailiman therapy' looks like a typo for 'mailman therapy',
# 'May' is listed twice, and the bare ' ' entries are no-ops if each entry is
# swapped for a single space (presumably they were runs of spaces in the
# original file) -- confirm before pruning anything.
# Review comment from @LucyWilcox (May 5, 2016): "This is fine, but you
# probably could have done a bit more cleaning up, or something else in
# addition."
badthings = [
    '\\n', 'lists.olin.edu', 'Anonymous rants and raves', 'Original',
    'Message', 'bounces', 'On Behalf Of', 'From', 'therapy', 'To', 'InReply',
    'PM', 'GMT', 'http', '!', '+', '/', '-', '.', ':', ',', 'Re', 'RE',
    'message',
    'Therapy', 'html', 'phpid', 'listinfo', 'mailman', 'olin edu', '\\t',
    'mailto',
    'From Date To Subject', 'To Subject', 'Sent', '\\tmailman therapy',
    'MessageID', 'mailiman therapy', 'at Subject', 'ferences', 'Fromtherapy',
    'attherapy', 'Subject', 'Date', 'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August', 'September', 'October', 'November',
    'December', 'Jan', 'Feb', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
    'Nov', 'Dec', 'ID', 'ply', 'On', 'AM', 'gmail', 'InplyIFID', 'InplyD',
    'Behalf Of', ' ', ' ', ' ', ' ', ' ', ' e ', ' com ', ' text ', ' s ',
    ' content ', ' FW ', ' RF ', ' Of ', ' Sniffer ', ' Behalf ', ' CO ',
    ' E ', ' C ',
]

# Load the old-therapy archive and reduce it to a list of word tokens.
# The file is only read, so it is opened read-only, and the with-block closes
# the handle even if reading fails.
with open('oldtherapy.txt', 'r') as old_therapyy:
    old_therapy = str(old_therapyy.read())
# Python 2 str.translate: a None table plus a deletechars string drops every
# listed symbol and digit from the text.
old_therapy = old_therapy.translate(None, '>"[]()1*23%4;56&$789#0=@_?<\'')
# partition(' ') returns a 3-tuple and str() of a tuple is its repr, so real
# newlines and tabs show up as the two-character sequences backslash-n and
# backslash-t -- which is what the '\\n' and '\\t' entries in badthings match.
# The tuple's own parentheses, quotes and commas end up in the text as well.
old_therapy = str(old_therapy.partition(' '))
# Order matters here: each bad thing is blanked out in list order.
for badthing in badthings:
    old_therapy = old_therapy.replace(badthing, ' ')
# split() with no argument also swallows the runs of spaces left behind.
old_therapy = old_therapy.split()

# Load the new-therapy archive and reduce it to a list of word tokens.
# The file is only read, so it is opened read-only, and the with-block closes
# the handle even if reading fails.
with open('newtherapy.txt', 'r') as new_therapyy:
    new_therapy = str(new_therapyy.read())
# Python 2 str.translate: a None table plus a deletechars string drops every
# listed symbol and digit.  This set differs from the one used for the old
# archive (it also removes '/', ',', '!' and '+', but not ';').
new_therapy = new_therapy.translate(None, '>/"[]()1*23%456&$789#0,!=@+_?<\'')
# str() of the partition tuple is its repr, so real newlines and tabs become
# the two-character sequences backslash-n and backslash-t that the '\\n' and
# '\\t' entries in badthings match.
new_therapy = str(new_therapy.partition(' '))
# Order matters here: each bad thing is blanked out in list order.
for badthing in badthings:
    new_therapy = new_therapy.replace(badthing, ' ')
# split() with no argument also swallows the runs of spaces left behind.
new_therapy = new_therapy.split()

def histogram(s):
    """Count how many times each element occurs in *s*.

    Args:
        s: Any iterable of hashable items, e.g. a list of words or a string.

    Returns:
        A plain dict mapping each distinct item to its number of
        occurrences.  An empty iterable gives an empty dict.
    """
    d = dict()
    for c in s:
        # A missing key counts as 0, so first and repeat sightings share
        # the same update.
        d[c] = d.get(c, 0) + 1
    return d

def most_frequent(s):
    """Return the distinct items of *s*, most frequent first.

    Items that occur equally often are ordered by the item itself, highest
    first, because whole (count, item) pairs are sorted in reverse.

    Args:
        s: Any iterable of hashable, mutually comparable items.

    Returns:
        A list holding each distinct item once, in descending rank order.
        An empty iterable gives an empty list.
    """
    from collections import Counter

    counts = Counter(s)
    # items() exists on both Python 2 and 3, unlike iteritems().
    ranked = sorted(((freq, x) for x, freq in counts.items()), reverse=True)
    return [x for freq, x in ranked]

# Skip the head of each ranking (the articles and similar filler) so the
# slices start at actual nouns, then keep the next 200 words.
old_words = most_frequent(old_therapy)[23:223]
new_words = most_frequent(new_therapy)[20:220]

# Words present in one era's slice but absent from the other's, kept in
# their ranking order.
unique_new = [word for word in new_words if word not in old_words]
unique_old = [word for word in old_words if word not in new_words]

# Results: sentiment of each 200-word slice, then the words unique to each
# era followed by the sentiment of just those words.
# sentiment() is not defined in this file; presumably it is supplied by the
# star import from pattern.en.
# Every print() receives exactly one argument, so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print('Sentiment for first 200 old therapy:' + str(sentiment(old_words)))
print('Sentiment for first 200 new therapy:' + str(sentiment(new_words)))
print(unique_old)
print('Sentiment for unique old therapy:' + str(sentiment(unique_old)))
print(unique_new)
print('Sentiment for unique new therapy:' + str(sentiment(unique_new)))

213 changes: 213 additions & 0 deletions therapyanal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@

import pickle
from pattern.en import *
# Ordered (substring, replacement) pairs applied to the old-therapy text.
# The order is significant: earlier replacements change what later entries
# can still match, so do not sort or deduplicate this table.  Most entries
# are swapped for a space; a few are deleted outright ('').
# NOTE(review): the five (' ', ' ') pairs are no-ops as written -- presumably
# they collapsed runs of spaces in the original file; the final split() makes
# that unnecessary either way.
_OLD_REPLACEMENTS = (
    ('\\n', ' '),
    ('lists.olin.edu', ' '),
    ('Anonymous rants and raves', ' '),
    ('Original', ' '),
    ('Message', ' '),
    ('bounces', ' '),
    ('On Behalf Of', ' '),
    ('From', ' '),
    ('therapy', ' '),
    ('To', ' '),
    ('InReply', ' '),
    ('PM', ' '),
    ('GMT', ' '),
    ('http', ' '),
    ('!', ' '),
    ('+', ' '),
    ('/', ' '),
    ('-', ' '),
    ('.', ' '),
    (':', ' '),
    (',', ' '),
    ('Re', ' '),
    ('RE', ' '),
    ('Original message', ' '),
    ('Therapy', ' '),
    ('html', ' '),
    ('phpid', ' '),
    ('listinfo', ' '),
    ('mailman', ''),
    ('olin edu', ' '),
    ('\\t', ' '),
    ('mailto', ' '),
    ('From Date To Subject', ' '),
    ('On Behalf Of Sent', ' '),
    ('To Subject', ' '),
    ('To therapy', ' '),
    ('Sent', ' '),
    ('\\tmailman therapy', ' '),
    ('MessageID', ' '),
    ('mailman therapy', ' '),
    ('at Subject', ' '),
    ('ferences', ' '),
    ('Fromtherapy', ' '),
    ('attherapy', ' '),
    ('Subject', ' '),
    ('Date', ' '),
    ('November', ' '),
    ('December', ' '),
    ('January', ' '),
    ('February', ' '),
    ('March', ' '),
    ('April', ' '),
    ('May', ' '),
    ('June', ' '),
    ('July', ' '),
    ('August', ' '),
    ('September', ' '),
    ('October', ' '),
    ('Nov', ' '),
    ('Dec', ' '),
    ('Jan', ' '),
    ('Feb', ' '),
    ('Apr', ' '),
    ('May', ' '),
    ('Jun', ' '),
    ('Jul', ' '),
    ('Aug', ' '),
    ('Sep', ' '),
    ('Oct', ' '),
    ('ID', ' '),
    ('ply', ' '),
    ('On', ' '),
    ('AM', ' '),
    ('gmail', ' '),
    ('InplyIFID', ''),
    ('InplyD', ''),
    ('behalf', ''),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    (' e ', ' '),
    (' com ', ' '),
    (' text ', ' '),
    (' content ', ' '),
    (' FW ', ' '),
)

# Load the old-therapy archive and reduce it to a list of word tokens.
# The file is only read, so it is opened read-only, and the with-block closes
# the handle even if reading fails.
with open('oldtherapy.txt', 'r') as old_therapyy:
    old_therapy = str(old_therapyy.read())
# Python 2 str.translate: a None table plus a deletechars string drops every
# listed symbol and digit from the text.
old_therapy = old_therapy.translate(None, '>"[]()1*23%4;56&$789#0=@_?<\'')
# str() of the partition tuple is its repr, so real newlines and tabs become
# the two-character sequences backslash-n and backslash-t that the '\\n' and
# '\\t' entries above match.
old_therapy = str(old_therapy.partition(' '))
for _target, _replacement in _OLD_REPLACEMENTS:
    old_therapy = old_therapy.replace(_target, _replacement)
old_therapy = old_therapy.split()

# Ordered (substring, replacement) pairs applied to the new-therapy text.
# The order is significant: earlier replacements change what later entries
# can still match, so do not sort or deduplicate this table.  Entries are
# either swapped for a space or deleted outright ('').
# NOTE(review): the five (' ', ' ') pairs are no-ops as written -- presumably
# they collapsed runs of spaces in the original file; the final split() makes
# that unnecessary either way.
_NEW_REPLACEMENTS = (
    ('\\n', ' '),
    ('lists.olin.edu', ' '),
    ('Original', ' '),
    ('Message', ' '),
    ('therapy', ' '),
    ('PM', ' '),
    ('GMT', ' '),
    ('http', ' '),
    ('-', ' '),
    ('.', ' '),
    (':', ' '),
    ('InReply', ' '),
    ('Re', ' '),
    ('Original message', ' '),
    ('Therapy', ' '),
    ('heredownatyale', ' '),
    ('orgtheyaleproblem', ' '),
    ('beginsinhighschool', ' '),
    ('heterodoxacademy', ' '),
    ('phpid', ' '),
    ('bounces', ' '),
    ('From Date To Subject', ' '),
    ('On Behalf Of', ' '),
    ('To', ' '),
    ('Sent', ' '),
    ('\\tmailman', ' '),
    ('ID', ' '),
    ('mailman', ' '),
    ('at Subject', ' '),
    ('ferences', ' '),
    ('Subject', ' '),
    ('Date', ' '),
    ('November', ' '),
    ('December', ' '),
    ('January', ' '),
    ('February', ' '),
    ('March', ' '),
    ('April', ' '),
    ('May', ' '),
    ('June', ' '),
    ('July', ' '),
    ('August', ' '),
    ('September', ' '),
    ('October', ' '),
    ('Nov', ' '),
    ('Apr', ' '),
    ('Dec', ' '),
    ('Feb', ' '),
    ('Jan', ' '),
    ('From', ' '),
    ('mailto', ' '),
    ('Behalf', ''),
    ('behalf', ''),
    ('Of', ''),
    ('AM', ''),
    ('ply', ''),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    (' ', ' '),
    ('Anonymous rants and raves', ''),
    (' Id ', ''),
    (' RF ', ''),
    (' Sniffer ', ''),
    (' E ', ''),
    (' CO ', ''),
    (' s ', ''),
    (' C ', ''),
)

# Load the new-therapy archive and reduce it to a list of word tokens.
# The file is only read, so it is opened read-only, and the with-block closes
# the handle even if reading fails.
with open('newtherapy.txt', 'r') as new_therapyy:
    new_therapy = str(new_therapyy.read())
# Python 2 str.translate: a None table plus a deletechars string drops every
# listed symbol and digit from the text.
new_therapy = new_therapy.translate(None, '>/"[]()1*23%456&$789#0,!=@+_?<\'')
# str() of the partition tuple is its repr, so real newlines and tabs become
# the two-character sequences backslash-n and backslash-t that the '\\n' and
# '\\tmailman' entries above match.
new_therapy = str(new_therapy.partition(' '))
for _target, _replacement in _NEW_REPLACEMENTS:
    new_therapy = new_therapy.replace(_target, _replacement)
new_therapy = new_therapy.split()

def histogram(s):
    """Count how many times each element occurs in *s*.

    Args:
        s: Any iterable of hashable items, e.g. a list of words or a string.

    Returns:
        A plain dict mapping each distinct item to its number of
        occurrences.  An empty iterable gives an empty dict.
    """
    d = dict()
    for c in s:
        # A missing key counts as 0, so first and repeat sightings share
        # the same update.
        d[c] = d.get(c, 0) + 1
    return d

def most_frequent(s):
    """Return the distinct items of *s*, most frequent first.

    Items that occur equally often are ordered by the item itself, highest
    first, because whole (count, item) pairs are sorted in reverse.

    Args:
        s: Any iterable of hashable, mutually comparable items.

    Returns:
        A list holding each distinct item once, in descending rank order.
        An empty iterable gives an empty list.
    """
    from collections import Counter

    counts = Counter(s)
    # items() exists on both Python 2 and 3, unlike iteritems().
    ranked = sorted(((freq, x) for x, freq in counts.items()), reverse=True)
    return [x for freq, x in ranked]

# Drop the head of each ranking (the first 23 old / 20 new entries) and keep
# the following 200 words from each archive.
old_words = most_frequent(old_therapy)[23:223]
new_words = most_frequent(new_therapy)[20:220]

# Words present in one era's slice but absent from the other's, kept in
# their ranking order.
unique_new = [word for word in new_words if word not in old_words]
unique_old = [word for word in old_words if word not in new_words]


# Results: sentiment of each 200-word slice, then the words unique to each
# era followed by the sentiment of just those words.
# sentiment() is not defined in this file; presumably it is supplied by the
# star import from pattern.en.
# Every print() receives exactly one argument, so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print('Sentiment for first 200 old therapy:' + str(sentiment(old_words)))
print('Sentiment for first 200 new therapy:' + str(sentiment(new_words)))
print(unique_old)
print('Sentiment for unique old therapy:' + str(sentiment(unique_old)))
print(unique_new)
print('Sentiment for unique new therapy:' + str(sentiment(unique_new)))