Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# TextMining

This is the base repo for the text mining and analysis project for Software Design at Olin College.
My writeup is located at https://github.com/NickShermeister/TextMining/blob/master/Writeup.pdf
Binary file added Writeup.pdf
Binary file not shown.
561 changes: 561 additions & 0 deletions alltrump

Large diffs are not rendered by default.

Binary file added allwords.p
Binary file not shown.
Binary file added backup.p
Binary file not shown.
Binary file added dict1.p
Binary file not shown.
Binary file added dict2.p
Binary file not shown.
Binary file added dicts.p
Binary file not shown.
174 changes: 174 additions & 0 deletions project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
"""
Mini Project 5--Re-editing of Mini Project 3 (Text Mining)
Author: Nicholas Sherman
Date: 4/22/2017
Class: Software Design

This project builds Markov chains from Family Guy scripts and Donald Trump speeches to generate mock speeches for entertainment purposes.
"""
from bs4 import BeautifulSoup
import requests
import re
import pickle
import os.path
from pathlib import Path
import random

def dict_creation_one(script, things=None):
    '''Create the ONE-word Markov dictionary; only needed if there is no dict1.p already.

    This dictionary maps each word to the list of words that directly follow
    it in the text. It is used to pick the word after the opening word of a
    speech, and leaves room for more randomness later on.

    inputs: script -- a single string containing all scripts
            things -- optional existing dictionary to extend in place (lets a
                      caller build the dictionary incrementally); a fresh
                      dictionary is created when omitted
    outputs: a dictionary of one-word keys -> list of following words'''
    # A fresh dict per call: a mutable default argument would be shared
    # between calls and silently accumulate words from earlier scripts.
    entiredict = {} if things is None else things

    # Symbols that should never appear in the generated speech.
    unwanted = '[]@#$%^&*():"\\—|-'
    cleaned = script.translate(str.maketrans('', '', unwanted)).lower()
    listowords = cleaned.split()

    # Pair every word with its successor; the last word has no successor and
    # therefore never becomes a key on its own account.
    for current, following in zip(listowords, listowords[1:]):
        entiredict.setdefault(current, []).append(following)
    return entiredict

def dict_creation_two(script, things=None):
    '''Create the TWO-word Markov dictionary; only needed if there is no dict2.p already.

    This is the dictionary that is referenced for most of the project. Each
    key is a tuple of two consecutive words and each value is the list of
    words seen directly after that pair.

    inputs: script -- a single string containing all scripts
            things -- optional existing dictionary to extend in place (lets a
                      caller build the dictionary incrementally); a fresh
                      dictionary is created when omitted
    outputs: a dictionary with two-word tuple keys -> list of following words'''
    # A fresh dict per call: a mutable default argument would be shared
    # between calls and silently accumulate words from earlier scripts.
    entiredict = {} if things is None else things

    # Symbols that should never appear in the generated speech. Sentence
    # punctuation ('.', '!', '?') is deliberately kept on the words.
    unwanted = '[]@#$%^&*():"\\—|-'
    cleaned = script.translate(str.maketrans('', '', unwanted)).lower()
    listowords = cleaned.split()

    # Slide a three-word window over the text, including the final window
    # (the previous index arithmetic stopped one window short).
    for first, second, third in zip(listowords, listowords[1:], listowords[2:]):
        entiredict.setdefault((first, second), []).append(third)
    return entiredict

# Family Guy specific names and the Trump-flavoured word each is swapped for.
# Order matters: the first needle found inside a token wins.
_NAME_SWAPS = (
    ("peter", "Trump"),
    ("lois", "Ivanka"),
    ("brian", "Pence"),
    ("stewie", "Obama"),
    ("quagmire", "ISIS"),
    ("meg", "Mexico"),
    ("chris", "China"),
    ("quahog", "MURICA"),  # This is more likely to sound like Trump than "America"
    ("griffin", "President"),
)
_FIRST_PERSON = frozenset(("i", "i'm", "i'll", "i've"))
_SENTENCE_ENDINGS = ('.', '?', '!')


def _ends_sentence(token):
    """Return True when the token contains sentence-ending punctuation."""
    return any(mark in token for mark in _SENTENCE_ENDINGS)


def _split_run_on(token):
    """Split a 'word.word' token into ['word.', 'Word']; leave anything else alone."""
    head, dot, tail = token.partition('.')
    if dot and '.' not in tail and head[-1:].isalpha() and tail[:1].isalpha():
        return [head + '.', tail.capitalize()]
    return [token]


def _swap_name(token):
    """Replace a token mentioning a Family Guy name, keeping trailing punctuation."""
    lowered = token.lower()
    for needle, replacement in _NAME_SWAPS:
        if needle in lowered:
            trailing = token[len(token.rstrip('.,!?;')):]
            return replacement + trailing
    return token


def markov_chain_two(yup, yup2, allwordss):
    """The markov chain function that actually generates the speech.

    inputs: yup -- the two key-word dictionary ((word, word) -> followers)
            yup2 -- the one key-word dictionary (word -> followers)
            allwordss -- a string of all scripts, used to pick the first word
    outputs: a string that is the "final" speech that Trump would make; an
             empty string when there is nothing to start a speech from

    The speech aims for a random length of 299-498 words but stops early if
    the chain reaches a word with no known follower instead of raising
    KeyError.
    """
    target_length = random.randrange(300, 500) - 1

    # Only words that actually have followers can start a speech; drawing
    # from the raw text keeps the choice weighted by word frequency.
    seed_pool = [word for word in allwordss.lower().split() if yup2.get(word)]
    if not seed_pool:
        seed_pool = [word for word in yup2 if yup2[word]]
    if not seed_pool:
        return ''

    speech = [random.choice(seed_pool).capitalize()]
    while len(speech) < target_length:
        last = speech[-1].lower()
        followers = None
        if len(speech) >= 2:
            followers = yup.get((speech[-2].lower(), last))
        # Fall back to the one-word dictionary when the pair is unknown.
        followers = followers or yup2.get(last)
        if not followers:
            break
        word = random.choice(followers)
        # Capitalize sentence openers and the pronoun "I". The pronoun test
        # is whole-word membership, not a substring test, so "m" or "ve"
        # are left alone.
        if _ends_sentence(speech[-1]) or word in _FIRST_PERSON:
            word = word.capitalize()
        speech.append(word)

    # Build a new list rather than inserting into the one being iterated.
    polished = []
    for token in speech:
        for piece in _split_run_on(token):
            piece = _swap_name(piece).replace(' ', '')
            if piece:
                polished.append(piece)

    product = ' '.join(polished)
    if not _ends_sentence(polished[-1]):  # add closing punctuation if missing
        product += '.'
    return product

if __name__ == '__main__':
    # Script entry point: build (or reuse) the pickled Markov dictionaries,
    # then generate and print one speech.

    # Each entry is used as the exclusive upper bound of range(1, n) below,
    # so presumably it is the season's episode count plus one (verify against
    # the site). Index 0 is a placeholder because seasons are 1-based.
    episodes_in_season = [-1, 8, 22, 23, 28, 19, 13, 17, 22, 19, 24, 23, 22, 19, 21, 14]

    # Exclusive upper bound: seasons 1 .. max_season - 1 are scraped.
    max_season = 3
    allscripts = ''
    my_file = Path("dict1.p")
    my_file2 = Path("dict2.p")
    my_file3 = Path("allwords.p")
    remake = False

    y = input("Do you want to customize the number of seasons? (y/n)")
    if y.lower() == 'y':
        requested = input("How many seasons do you want?: ")
        try:
            season_count = int(requested)
        except ValueError:
            print("Could not read %r as a number; using the default number of seasons." % requested)
        else:
            # Clamp to the seasons we have episode counts for.
            season_count = max(1, min(season_count, len(episodes_in_season) - 1))
            max_season = season_count + 1
        remake = True

    # Rebuild when asked to, or when any of the cached pickles is missing.
    caches_present = my_file.is_file() and my_file2.is_file() and my_file3.is_file()
    if remake or not caches_present:
        basehtml = 'http://www.springfieldspringfield.co.uk/view_episode_scripts.php?tv-show=family-guy&episode=s'
        for season in range(1, max_season):
            for i in range(1, episodes_in_season[season]):
                # Episode ids look like "s01e07": both numbers zero-padded to two digits.
                temp = basehtml + '%02de%02d' % (season, i)
                html = BeautifulSoup(requests.get(temp).text, 'lxml')
                thing = html.find("div", class_='scrolling-script-container')
                if thing is None:
                    print("No script found at %s; skipping." % temp)
                    continue
                allscripts = allscripts + " " + str(thing.text)

        # gather the trump speeches
        html = BeautifulSoup(requests.get('https://github.com/PedramNavid/trump_speeches/blob/master/data/full_speech.txt').text, 'lxml')
        thing = html.find("table", class_='highlight tab-size js-file-line-container')
        if thing is None:
            print("Could not find the Trump speeches on the page; continuing without them.")
        else:
            allscripts += thing.text

        # pickle the necessary information
        payloads = (
            ("allwords.p", allscripts),
            ("dict1.p", dict_creation_one(allscripts)),
            ("dict2.p", dict_creation_two(allscripts)),
            ("backup.p", allscripts),
        )
        for filename, payload in payloads:
            with open(filename, "wb") as handle:
                pickle.dump(payload, handle)

    # Load the dictionaries/all words to run the program with.
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles produced by this script, never ones from an untrusted source.
    with open("dict1.p", "rb") as handle:
        oneworddict = pickle.load(handle)
    with open("dict2.p", "rb") as handle:
        twoworddict = pickle.load(handle)
    with open("allwords.p", "rb") as handle:
        allwords = pickle.load(handle)

    generated_speech = markov_chain_two(twoworddict, oneworddict, allwords)  # create the markov chain
    print(generated_speech)  # print the markov chaining
238 changes: 238 additions & 0 deletions samplescript

Large diffs are not rendered by default.