88 changes: 88 additions & 0 deletions analyze.py
@@ -0,0 +1,88 @@
import pickle
import re
from tabulate import tabulate
from save import merge_dictionaries

word_pattern = re.compile(r"[\w']+")  # '+' (not '*') avoids empty matches

current_candidates = [
'Carson', 'Cruz', 'Kasich', 'Rubio', 'Trump', 'Sanders', 'Clinton']


def get_candidate_vocabularies(location='downloads/data/master.pickle'):
    ''' returns (candidate_vocabularies, candidate_remarks), where
    candidate_vocabularies takes the form {candidate: {word: frequency}} and
    is derived from the candidate_remarks {candidate: [remarks]} dictionary
    pickled at location
    '''
    candidate_remarks = pickle.load(open(location, 'rb'))

candidate_vocabularies = {} # {cand: {word: freq}}
for candidate in [name.upper() for name in current_candidates]:
        print 'processing {} remarks from {}...'.format(
            len(candidate_remarks[candidate]), candidate)

candidate_vocabulary = {}

        for remark in candidate_remarks[candidate]:
            # tally every word in the remark, case-folded
            for word in word_pattern.findall(remark):
                word = word.lower()
                candidate_vocabulary[word] = (
                    candidate_vocabulary.get(word, 0) + 1)
candidate_vocabularies[candidate] = candidate_vocabulary

return candidate_vocabularies, candidate_remarks


def get_highlights(subject_VD, population_VD, proportion):
    ''' returns the 30 words most frequently used by the candidate that are
    also used only, or mostly (as controlled by proportion), by that
    candidate
    '''
uniques = []
for word, frequency in subject_VD.items():
if frequency > proportion * population_VD[word]:
uniques.append((word, frequency))
items = []
    for word, frequency in sorted(uniques, key=lambda x: x[1])[-30:]:
        items += ['{:<5}{}'.format(frequency, word)]
return items


def print_all_highlights(proportion=0.99, candidates=current_candidates):
    ''' prints a table of the words most frequently used which are also
    used only (or primarily, as controlled by proportion) by each
    candidate
    '''
candidate_vocabularies = get_candidate_vocabularies()[0]
population_VD = merge_dictionaries(
[dict(vocabulary) for vocabulary in candidate_vocabularies.values()],
0)
highlights = {}
for candidate in candidates:
candidate_VD = dict(candidate_vocabularies[candidate.upper()])
highlights[candidate] = get_highlights(
candidate_VD, population_VD, proportion)

# now we print...
print tabulate(highlights, headers='keys')


# def markov():
# candies_vocab, candy_remarks = get_candidate_vocabularies()
# for candy_vocab in candies_vocab:
# candy_map = {}
# for word in candy_vocab:

# pass


if __name__ == '__main__':
print_all_highlights(0.5)
# import doctest
# doctest.run_docstring_examples(
# markov, globals(),
# verbose=True, name="Jus' Testin'")
Binary file added downloads/.DS_Store
Binary file not shown.
Binary file added downloads/data/.DS_Store
Binary file not shown.
(Large diffs below are not rendered by default.)

12,374 changes: 12,374 additions & 0 deletions downloads/data/master.pickle
278 changes: 278 additions & 0 deletions downloads/data/script0.pickle
738 changes: 738 additions & 0 deletions downloads/data/script1.pickle
6,251 changes: 6,251 additions & 0 deletions downloads/data/script10.pickle
6,646 changes: 6,646 additions & 0 deletions downloads/data/script11.pickle
7,165 changes: 7,165 additions & 0 deletions downloads/data/script12.pickle
7,779 changes: 7,779 additions & 0 deletions downloads/data/script13.pickle
8,025 changes: 8,025 additions & 0 deletions downloads/data/script14.pickle
8,585 changes: 8,585 additions & 0 deletions downloads/data/script15.pickle
9,009 changes: 9,009 additions & 0 deletions downloads/data/script16.pickle
9,823 changes: 9,823 additions & 0 deletions downloads/data/script17.pickle
10,545 changes: 10,545 additions & 0 deletions downloads/data/script18.pickle
11,249 changes: 11,249 additions & 0 deletions downloads/data/script19.pickle
1,521 changes: 1,521 additions & 0 deletions downloads/data/script2.pickle
11,817 changes: 11,817 additions & 0 deletions downloads/data/script20.pickle
12,374 changes: 12,374 additions & 0 deletions downloads/data/script21.pickle
2,117 changes: 2,117 additions & 0 deletions downloads/data/script3.pickle
3,034 changes: 3,034 additions & 0 deletions downloads/data/script4.pickle
4,509 changes: 4,509 additions & 0 deletions downloads/data/script5.pickle
5,126 changes: 5,126 additions & 0 deletions downloads/data/script6.pickle
5,340 changes: 5,340 additions & 0 deletions downloads/data/script7.pickle
5,502 changes: 5,502 additions & 0 deletions downloads/data/script8.pickle
5,980 changes: 5,980 additions & 0 deletions downloads/data/script9.pickle
745 changes: 745 additions & 0 deletions downloads/data2/script0.pickle
1,461 changes: 1,461 additions & 0 deletions downloads/data2/script1.pickle
7,329 changes: 7,329 additions & 0 deletions downloads/data2/script10.pickle
1,632 changes: 1,632 additions & 0 deletions downloads/data2/script2.pickle
2,215 changes: 2,215 additions & 0 deletions downloads/data2/script3.pickle
2,781 changes: 2,781 additions & 0 deletions downloads/data2/script4.pickle
3,534 changes: 3,534 additions & 0 deletions downloads/data2/script5.pickle
4,121 changes: 4,121 additions & 0 deletions downloads/data2/script6.pickle
5,035 changes: 5,035 additions & 0 deletions downloads/data2/script7.pickle
6,510 changes: 6,510 additions & 0 deletions downloads/data2/script8.pickle
7,118 changes: 7,118 additions & 0 deletions downloads/data2/script9.pickle
Binary file added downloads/html_transcripts/.DS_Store
Binary file not shown.
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script0.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script1.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script10.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script11.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script12.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script13.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script14.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script15.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script16.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script17.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script18.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script19.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script2.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script20.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script21.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script3.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script4.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script5.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script6.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script7.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script8.html
1,208 changes: 1,208 additions & 0 deletions downloads/html_transcripts/script9.html
1,961 changes: 1,961 additions & 0 deletions downloads/index.html

143 changes: 143 additions & 0 deletions save.py
@@ -0,0 +1,143 @@
from pattern.web import URL, plaintext
import pickle
import os
import re
import glob

# this is the url that I am finding all other transcript urls from:
TOC_url = 'http://www.presidency.ucsb.edu/debates.php'
sampletranscript_url = 'http://www.presidency.ucsb.edu/ws/index.php?pid=111520'
# pre-compiling with re.S so '.' includes \n
script_pattern = re.compile(r'\n([A-Z]{4,}): (.*?)(?=\n?[A-Z]{4,})', re.S)


def save_dictionary(d, filename, folder=''):
    ''' pickles a dictionary with filename in downloads/folder/
    '''
    if not os.path.exists('downloads/' + folder):
        os.mkdir('downloads/' + folder)
    with open('downloads/' + folder + filename, 'wb') as f:
        pickle.dump(d, f)


def get_remarks_from_transcript(html, start='MODERATORS'):
''' retrieves remarks from html transcript of a debate and stores them
in a dictionary {candidate: [remarks]} which is returned
'''
candidates_remarks = {}
plain_text = plaintext(html)
script_tuple = script_pattern.findall(plain_text)
for name, remark in script_tuple:
        # strip bracketed annotations like [applause] and collapse newlines
        clean_remark = re.sub(r'\[[^\]]+\]', '', remark.replace('\n', ' '))
candidates_remarks[name] = candidates_remarks.get(
name, []) + [clean_remark]
return candidates_remarks


def save_html(url, filename, folder=''):
''' saves html from url to filename in downloads/folder/
'''
if not os.path.exists('downloads/' + folder):
os.mkdir('downloads/' + folder)
result = URL(url).download()
f = open('downloads/' + folder + filename, 'w')
f.write(result)
f.close()


def get_transcript_links(
html, years, end_string='Candidates Debate in'):
''' returns a list of debate transcript links retrieved from html
(the table of contents page)
'''
if isinstance(years, int):
years = (years,)

year_pattern = '(?:'+'|'.join(str(year) for year in years)+')'

    url_pattern = re.compile(
        ', {0}.{1}href="(http://[^"]*?)">(?:Republican|Democratic)'.format(
            year_pattern, '{10,200}?'),
        re.S)
    links = url_pattern.findall(html)
    return [link for link in links if link]


def get_html_transcripts():
''' saves html retrieved from links found on table of contents page
'''
print 'getting table of contents...'
# TOC_html = URL(TOC_url).download()
TOC_html = open('downloads/index.html', 'r').read()

links = get_transcript_links(TOC_html, (2015, 2016))
print 'downloading html transcripts...'
# print links
    for i, link in enumerate(links):
        save_html(link, 'script{}.html'.format(i), 'html_transcripts/')

print 'html is saved.'


def merge_dictionaries(dictionaries, default=[]):
    ''' returns a single dictionary combining the values of all input
    dictionaries under '+'; default is the identity value (a list for
    {key: [values]} dictionaries, 0 for {key: count} dictionaries)
    >>> merge_dictionaries([{'a': [1, 2], 'b': [3, 4]}, {'b': [5, 6], 'c': [7, 8]}])
    {'a': [1, 2], 'b': [3, 4, 5, 6], 'c': [7, 8]} # or similar
    '''
master = {}
for d in dictionaries:
for key in d:
master[key] = master.get(key, default) + d[key]
return master


def build_candidate_remarks():
''' compiles a master set of remarks from html transcripts merged together
and stores it in master.pickle in downloads
>>> build_candidate_remarks()
'''
project_dir = os.getcwd()

transcript_paths = glob.glob(project_dir+'/downloads/html_transcripts/*')

# this block is legacy--saves individual debate dictionaries before
# merge to master:

# debate_no = 0
# for html_transcript in transcript_paths:
# transcript = open(html_transcript, 'r').read()
# d = get_remarks_from_transcript(transcript)
# save_dictionary(d, 'script{}.pickle'.format(debate_no), 'data/')
# debate_no += 1

print 'getting debate remarks dictionaries...'
debate_dictionaries = [
get_remarks_from_transcript(
open(html_transcript, 'r').read()
)
for html_transcript in transcript_paths
]

print 'merging dictionaries...'
master = merge_dictionaries(debate_dictionaries)

print 'pickling and saving...'
save_dictionary(master, 'master.pickle', 'data/')

print 'master remarks dictionary saved'


if __name__ == '__main__':

build_candidate_remarks()
# import doctest
# doctest.run_docstring_examples(
# build_candidate_remarks, globals(), verbose=True, name="Jus' Testin'")
86 changes: 86 additions & 0 deletions writeup.txt
@@ -0,0 +1,86 @@
Project Overview [Maximum 100 words]

The object of this project was to analyze the speech of our dear candidates for president of the United States. I used transcripts from http://www.presidency.ucsb.edu/, and save.py allows future debates to be downloaded and incorporated into the data set used in analyze.py. I spent most of my time on the data collection side, trying to automate it as much as possible, with basic frequency analysis performed on that data. I would like to expand this program to generate characteristic phrases for each candidate, to prove that if you sound like a robot in the debates, you are either a candidate or a Python program.


Implementation

The program starts by downloading the html for the table of contents page for the debates, from which it finds all legitimate links to transcripts with regular expressions (I learned soooo much about these...), and downloads the html for each. I chose to include the table of contents page in the implementation so that a user can automatically update the data set as more transcripts are added to that page. From this html, the program stores each 'remark' in a dictionary of lists of remarks for each candidate, which is pickled and stored locally for analysis. In analysis, a word frequency dictionary is calculated for each candidate, and the most frequent words which are used primarily by a single candidate are printed in a nice-looking table. A minimal end-to-end run is sketched below.
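
One way to run the whole pipeline, using the three entry points defined in save.py and analyze.py above (a sketch: it assumes the downloads/ directory layout those modules create, including the cached downloads/index.html this change adds):

    from save import get_html_transcripts, build_candidate_remarks
    from analyze import print_all_highlights

    get_html_transcripts()     # fetch the transcript html linked from the table of contents
    build_candidate_remarks()  # parse remarks, pickle to downloads/data/master.pickle
    print_all_highlights(0.7)  # print the characteristic-word table at proportion 0.70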

I also did a lot of work with saving to local folders, which was a serious learning process but allows for better organization of the program directories. As mentioned above, regular expressions were central to a number of processes, the most complicated of which were the transcript parsing pattern, r'\n([A-Z]{4,}): (.*?)(?=\n?[A-Z]{4,})', and the pattern used to match urls from the table of contents page, ', {0}.{1}href="(http://[^"]*?)">(?:Republican|Democratic)'.format(year_pattern, '{10,200}?'). This one I thought particularly clever because it combines regex and string interpolation to target transcripts from specified election years.
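
To illustrate the parsing pattern on a toy snippet (the names and remarks here are invented): each match pairs an ALL-CAPS speaker name with everything said up to the next such name, and because the closing name is only a lookahead, the final remark of a transcript is not captured.

    import re

    script_pattern = re.compile(r'\n([A-Z]{4,}): (.*?)(?=\n?[A-Z]{4,})', re.S)
    sample = '\nTRUMP: We will win.\nSANDERS: The economy is rigged.\nMODERATOR: Thank you.'
    print script_pattern.findall(sample)
    # [('TRUMP', 'We will win.'), ('SANDERS', 'The economy is rigged.')]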

Results
Sanders Clinton Trump Rubio
------------------ ------------------ ---------------- ---------------
15 universities 10 donations 9 leaving 6 seconds
16 veterans 10 incomes 9 smart 6 sunni
16 corporations 10 children's 10 total 6 strongest
17 vermont 10 equal 10 sitting 6 harder
17 revolution 10 prescription 11 mess 6 vat
17 almost 11 proposed 11 oh 7 criminals
17 huge 11 possible 11 bomb 7 underminin
17 corrupt 11 brothers 12 built 7 shia
17 disagree 11 toward 13 anymore 7 america's
18 jail 12 potential 13 politicians 7 expensive
18 finance 12 fund 14 hundreds 7 agency
19 super 13 difficult 14 atlantic 8 struggling
19 billionaires 14 dodd 14 nice 8 blessed
19 large 14 figure 15 totally 8 access
19 criminal 15 others 16 guy 9 hold
19 view 15 agenda 17 city 9 human
21 contributions 16 plans 19 nobody 9 choose
24 wealth 16 comprehensive 20 mexico 10 prove
25 earth 17 further 20 domain 11 enterprise
25 public 17 progressive 20 laws 11 pro
25 legislation 18 costs 20 eminent 11 dream
27 african 18 frank 20 deals 12 rand
32 kids 19 communities 21 frankly 13 illegally
35 o' 19 particularly 21 oil 16 until
35 income 22 certainly 21 excuse 16 21st
49 class 23 agreement 26 wouldn't 18 greatest
61 major 34 affordable 27 company 18 paycheck
80 street 46 try 30 trade 28 century
83 campaign 51 sanders 32 tremendous 32 someone
112 secretary 93 senator 38 jeb 37 barack

Carson Cruz Kasich
----------------- ------------------ --------------
3 losers 8 conservatives 6 architect
3 honesty 8 roberts 6 served
3 pundits 9 texas 7 400
3 utilize 9 steve 7 lift
3 register 10 rubio 7 veteran
3 bully 10 org 7 town
3 memorandum 10 defeating 7 collar
3 pc 10 tedcruz 8 prisons
3 healing 11 jeff 8 balancing
3 tithing 11 maria 8 pentagon
3 rapid 11 born 8 realize
3 weaponry 11 sessions 9 cuts
4 integrity 11 schumer 9 formula
4 bencarson 13 focus 9 gulf
4 intellect 13 mom 9 fiscal
4 empowerment 13 defend 9 budgets
4 box 14 secure 9 treat
4 aspect 14 irs 10 8
4 declare 14 s 11 growing
4 cents 14 fed 11 message
4 recognizing 17 simple 11 sanctions
4 neurosurgeon 19 note 12 secondly
5 correctness 20 islamic 12 hole
5 ethics 24 commander 12 balance
5 dependent 25 marco 14 rise
6 evil 26 chief 14 surplus
6 deductions 29 court 17 medicaid
6 agencies 31 flat 26 balanced
8 wars 33 donald 43 budget
9 thinking 36 amnesty 54 ohio



Above are the results of my frequency analysis for p = 0.70. This means that more than 70% of all recorded instances of a listed word were said by the candidate in whose column it lies. Looking through these lists, one can see why the words come up if one has been following the debates. For example, Sanders's primary opponent is Secretary Clinton, whom he addresses as such, and he has cornered the 'secretary' market. Likewise, she refers to him often, as reflected in 'senator' and 'sanders' at the top (or really the bottom) of her list. Note that Sanders's list does not include 'hillary' or 'clinton'--those are for the Republicans, who tend to talk about her more than him in their debates. Sanders also apparently gets meta more than anyone else with 'campaign', probably with statements like 'this *campaign* is about standing up to a *corrupt* *campaign* *finance* system'. A quick arithmetic check of the proportion rule is sketched below.
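
The rule in get_highlights lists a word for a candidate when that candidate's count exceeds proportion times the combined count across all candidates. The 112 below is 'secretary' from Sanders's column above; the combined count is hypothetical:

    p = 0.70
    freq_sanders = 112    # 'secretary' in Sanders's vocabulary
    freq_everyone = 150   # hypothetical combined count across all candidates
    print freq_sanders > p * freq_everyone  # True: 112 > 105.0, so 'secretary' is listed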

On the Republican side, we can see that Trump talks a lot about 'jeb', Rubio is on a first-name basis with Obama, and Carson doesn't really say much at all but prefers 'thinking'. Cruz likes to target 'donald' a lot, and also 'marco'. He's a mama's boy, discusses 'flat' taxation a lot, really thinks he has what it takes to be 'commander' in 'chief', and has been spending some time defending his legitimacy given that he was 'born' in Canada. Kasich enjoys 'balancing' 'budgets', so he has 'balanced' many a 'budget' while he 'served' as governor of 'ohio'. He also likes following up his statements with 'secondly'.

Reflection [~1 paragraph]
I thought my process worked pretty well. It was hard to provide unit tests for many of my functions because of the large data sets they handle, but it worked out. I also think I did a good job compartmentalizing my code for modularity along the way. Most of my issues came from regex, or from the fact that I couldn't visualize my data set very well (I had an issue early on where I was storing the same remarks 22 times in each candidate entry in the master dictionary, which I didn't catch until I was well into analysis and found that all my word frequencies were multiples of 22. That was fun...), but I eventually figured out everything that I wanted to.