1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.txt
Binary file added Text_Mining_Reflection_AHOPPE.pdf
Binary file not shown.
64 changes: 64 additions & 0 deletions markov_chat.py
@@ -0,0 +1,64 @@
""" This program is an extension of the Text Mining mini project I did on the
text in our Facebook group chat. It synthesizes a semi-interactive group chat
exchange """
import message_scrape as ms
import random
import pickle
import datetime

def start_chat(trained_dict):
    """Prompt loop: each time the user presses ENTER, print one generated message."""
    print 'Welcome to the 2015 Comrades Markov chat!\nPress ENTER to get started!'
    while True:
        raw_input()
        print create_msg(trained_dict)

def train(message_block):
    """Build a dictionary mapping each token in the corpus to the list of
    tokens observed immediately after it"""
    tokens = message_block.split()
    td = {token: [] for token in tokens}
    for i in range(len(tokens) - 1):
        td[tokens[i]].append(tokens[i+1])
    return td
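As a quick sketch of what train() produces, here is a toy corpus run through the same logic (key order will vary; the input is illustrative, not real chat data):

    >>> train('the cat sat | the cat ran')
    {'the': ['cat', 'cat'], 'cat': ['sat', 'ran'], 'sat': ['|'], '|': ['the'], 'ran': []}

Note that 'ran' ends the corpus and therefore has no followers; this is why create_msg below treats an empty follower list as an end-of-message.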


def create_msg(t_dict):
    """Create a message by chaining tokens from the training dictionary
    until the '|' end-of-message marker is reached"""
    _run = True
    msg = ''
    last = ''
    nxt = ''
    while _run:
        if not last:
            #pick a random value from a random key, retrying keys whose
            #follower list is empty
            followers = []
            while not followers:
                followers = t_dict[random.choice(t_dict.keys())]
            nxt = random.choice(followers)
        else:
            #pick a random value from the last key; a token with no recorded
            #followers (only possible for the corpus's final token) ends the message
            followers = t_dict[last]
            nxt = random.choice(followers) if followers else '|'
        if nxt == '|':
            _run = False
            nxt = '\n'
        #update step
        msg += nxt + " "
        last = nxt

    #if the very first pick was the end-of-message marker, send a reaction instead
    if msg == "\n ":
        msg = ':thumbs-up:\n'
    return str(datetime.datetime.now().time()) + ' > ' + msg
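A minimal sketch of driving create_msg() with a hand-built dictionary (the timestamp and the chosen tokens will vary from run to run, since every pick is random):

    >>> toy = {'hi': ['there', '|'], 'there': ['|'], '|': ['hi']}
    >>> print create_msg(toy)
    12:34:56.789012 > hi there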

if __name__ == '__main__':
    #check if we've already trained this thing
    try:
        with open('t_dict.txt', 'rb') as f:
            t_dict = pickle.load(f)
    except IOError:
        #scrape the messages
        block = ms.get_msgs('messages.htm', 'DEFAULT')
        text = ms.strip_markov(block)
        #train the data structure
        t_dict = train(text)
        #cache the trained dictionary for next time
        with open('t_dict.txt', 'wb') as f:
            pickle.dump(t_dict, f)
    #start the chatbot
    start_chat(t_dict)
105 changes: 105 additions & 0 deletions message_scrape.py
@@ -0,0 +1,105 @@
"""
This project takes a formatted Facebook message dump, messages.htm, scrapes it
for the most frequent words in a message thread I specified, and then
returns a word cloud of the most popular words in the thread.

I opted out of unit tests for BeautifulSoup things because the way it does data
types is difficult and confusing.
"""
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import string
import sys


def get_msgs(filename, threadname):
    """
    Takes the path of the message dump and returns the block of messages
    for the thread being searched for.

    NOTE: threadname is not yet used; the thread is currently selected by a
    hard-coded index found by inspecting the dump by hand.
    """
    # read in file
    bs = BeautifulSoup(open(filename), 'lxml')
    # get to the correct thread
    block = bs.body.contents[1].div.contents[70]
    return block
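A hypothetical alternative that actually uses threadname, assuming each thread in the dump lives in a <div class="thread"> (a given export may not match this layout):

    def find_thread(bs, threadname):
        # hypothetical helper: scan thread divs for one whose participant
        # line mentions the name we were given
        for div in bs.find_all('div', class_='thread'):
            if threadname in div.get_text()[:200]:
                return div
        return None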

def strip_msgs(msg_block):
"""
Strips all the messages out of the block of text and removes their p tags
"""
# get all the messages without any of the metadata
tagged_msgs = msg_block.contents[2::2]
# strip p tags
msgs = [str(m)[3:-4] for m in tagged_msgs]
# make them into a string
return " ".join(msgs)

def strip_markov(msg_block):
    """
    Strips all the messages out of the block of text and removes their p tags,
    joining them with a ' |' end-of-message marker for the Markov trainer
    """
    # get all the messages without any of the metadata
    tagged_msgs = msg_block.contents[2::2]
    # strip p tags
    msgs = [str(m)[3:-4] for m in tagged_msgs]
    # add a special designator for End of Message
    return " |\n".join(msgs)

def word_cloud(text):
"""
This function makes a wordcloud object and attempts to generate a word cloud
using the collected messages.
"""
wc = WordCloud()
wc.generate(text)
wc.to_file('test.png')
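A hedged usage note: the WordCloud constructor accepts sizing parameters if the default 400x200 image is too small (the values here are illustrative):

    wc = WordCloud(width=800, height=400, max_words=50)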

def get_freq(text):
"""
This function makes a frequency dictionary of all of the words it's given
"""
words = dict()
total_words = 0
for raw in text.split():
total_words += 1
word = raw.strip(string.punctuation).lower()
words[word] = words.get(word, 0) + 1
return words, total_words
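A toy run of get_freq() (dictionary key order will vary):

    >>> get_freq('Ok, ok... sounds good!')
    ({'ok': 2, 'sounds': 1, 'good': 1}, 4)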

def decorate_sort(dictionary):
    """
    This function takes a frequency dictionary and sorts it, returning an
    ordered list of (frequency, word) tuples, most frequent first
    """
    output = [(count, word) for word, count in dictionary.items()]
    output.sort(reverse=True)
    return output
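This is the classic decorate-sort idiom: putting the count in the first tuple slot makes the default tuple comparison sort by frequency, with the word as a tie-breaker. On the toy counts above:

    >>> decorate_sort({'ok': 2, 'sounds': 1, 'good': 1})
    [(2, 'ok'), (1, 'sounds'), (1, 'good')]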

if __name__ == '__main__':
    #prompt user for input
    words = int(raw_input('How many words to display: '))
    thread = raw_input('What thread to look for: ')

    #Extract message data
    block = get_msgs('messages.htm', thread)
    text = strip_msgs(block)

    #try to make a word cloud
    try:
        word_cloud(text)
    except ImportError as e:
        print e
        print "Probably a libfreetype error again"

    # Generate sorted frequency list
    freq, total = get_freq(text)
    freq = decorate_sort(freq)

    print 'Of {} total words in the chat,'.format(total)
    print 'the {} most used words in our chat are:'.format(words)
    string_freq = [str(e) for e in freq]
    print "\n".join(string_freq[:words])