diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..abd308e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,48 @@
+api_client_codes.py
+
+hamilton_0.html
+hamilton_1.html
+hamilton_2.html
+hamilton_3.html
+hamilton_4.html
+hamilton_5.html
+hamilton_6.html
+hamilton_7.html
+hamilton_8.html
+hamilton_9.html
+hamilton_10.html
+hamilton_11.html
+hamilton_12.html
+hamilton_13.html
+hamilton_14.html
+hamilton_15.html
+hamilton_16.html
+hamilton_17.html
+hamilton_18.html
+hamilton_19.html
+hamilton_20.html
+hamilton_21.html
+hamilton_22.html
+hamilton_23.html
+hamilton_24.html
+hamilton_25.html
+hamilton_26.html
+hamilton_27.html
+hamilton_28.html
+hamilton_29.html
+hamilton_30.html
+hamilton_31.html
+hamilton_32.html
+hamilton_33.html
+hamilton_34.html
+hamilton_35.html
+hamilton_36.html
+hamilton_37.html
+hamilton_38.html
+hamilton_39.html
+hamilton_40.html
+hamilton_41.html
+hamilton_42.html
+hamilton_43.html
+hamilton_44.html
+hamilton_45.html
diff --git a/import_hamilton.py b/import_hamilton.py
new file mode 100644
index 0000000..740f3a3
--- /dev/null
+++ b/import_hamilton.py
@@ -0,0 +1,70 @@
+"""this is the file i'll use to import/download all of the hamilton lyrics
+into text files"""
+
+# packages are useful
+from bs4 import BeautifulSoup
+import requests
+
+
+def find_links(soup):
+    """this function takes in a parsed html page and returns a list of
+    the links within it."""
+    list_o_links = []
+    # this iterates through all anchor tags in the page; anchors without
+    # an href are skipped so the concatenation can't blow up on None
+    for link in soup.find_all('a'):
+        href = link.get('href')
+        if href:
+            list_o_links.append('http://www.themusicallyrics.com/' + href)
+    # and the list gets returned
+    return list_o_links
+
+
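+# a quick sanity check of find_links; the markup here is invented for
+# illustration, not taken from the real site:
+#   >>> s = BeautifulSoup('<a href="h/351-x.html">x</a>', 'lxml')
+#   >>> find_links(s)
+#   ['http://www.themusicallyrics.com/h/351-x.html']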
+def cull_links(beginning_url, list_urls):
+    """this function takes in a URL fragment and a list of URLs and
+    returns the URLs that contain that fragment."""
+    new_list_urls = []
+    # iterates through the list of URLs; if an item contains the desired
+    # fragment, appends the item to the new list of URLs
+    for x in list_urls:
+        if beginning_url in x:
+            new_list_urls.append(x)
+    return new_list_urls
+
+
+def file_names(list_of_links, base_name):
+    """builds a numbered .html file name for each link"""
+    names_list = []
+    for i in range(len(list_of_links)):
+        names_list.append(base_name + '_' + str(i) + '.html')
+    return names_list
+
+
+def save_files(list_of_links, list_of_names):
+    """downloads each link and saves the response body under its name"""
+    for i in range(len(list_of_links)):
+        song = requests.get(list_of_links[i])
+        text_file = open(list_of_names[i], "w")
+        text_file.write(song.content)
+        text_file.close()
+
+# i can get the html of the page i want
+url_source = requests.get('http://www.themusicallyrics.com/h/351-hamilton-the-musical-lyrics.html')
+# i can save it in a file
+text_file = open('url_page.txt', 'w')
+text_file.write(url_source.content)
+text_file.close()
+
+# this lets me use BeautifulSoup because i want to
+soup = BeautifulSoup(url_source.content, 'lxml')
+
+# this uses the find_links function and defines which links i want
+some_urls = find_links(soup)
+useful_url = '/351-hamilton-the-musical-lyrics/'
+
+# this makes a list of links and a base name for naming the files i create
+list_of_links = cull_links(useful_url, some_urls)
+base_name = 'hamilton'
+
+# this is a list of names for the files i create
+names_list = file_names(list_of_links, base_name)
+
+# this uses the save_files function to get all the things i want
+save_files(list_of_links, names_list)
diff --git a/import_hamilton2.py b/import_hamilton2.py
new file mode 100644
index 0000000..59579be
--- /dev/null
+++ b/import_hamilton2.py
@@ -0,0 +1,72 @@
+"""this is the file i'll use to import/download all of the hamilton lyrics
+into text files"""
+
+# packages are useful
+from bs4 import BeautifulSoup
+import requests
+
+
+def find_links(soup):
+    """this function takes in a parsed html page and returns a list of
+    the raw href values within it (entries may be None)."""
+    list_o_links = []
+    # this iterates through all anchor tags in the page and appends each
+    # href to a list
+    for link in soup.find_all('a'):
+        list_o_links.append(link.get('href'))
+    # and the list gets returned
+    return list_o_links
+
+
+def cull_links(beginning_url, list_urls):
+    """this function takes in a URL fragment and a list of URLs and
+    returns the URLs that contain that fragment."""
+    new_list_urls = []
+    # iterates through the list of URLs, skipping None entries; if an
+    # item contains the desired fragment, appends it to the new list
+    for x in list_urls:
+        if x:
+            if beginning_url in x:
+                new_list_urls.append(x)
+    return new_list_urls
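+
+# cull_links keeps any entry containing the fragment and drops the None
+# entries that find_links can emit; the values here are invented:
+#   >>> cull_links('genius.com', ['http://genius.com/a-song', None, 'http://other.site/x'])
+#   ['http://genius.com/a-song']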
+
+
+def file_names(list_of_links, base_name):
+    """builds a numbered .html file name for each link"""
+    names_list = []
+    for i in range(len(list_of_links)):
+        names_list.append(base_name + '_' + str(i) + '.html')
+    return names_list
+
+
+def save_files(list_of_links, list_of_names):
+    """downloads each link and saves the response body under its name"""
+    for i in range(len(list_of_links)):
+        song = requests.get(list_of_links[i])
+        text_file = open(list_of_names[i], "w")
+        text_file.write(song.content)
+        text_file.close()
+
+# i can get the page i want; requests can't fetch file:// URLs, so the
+# downloaded booklet is read straight from disk (note it's a pdf, so
+# BeautifulSoup won't find much html in it)
+booklet = open('/home/lnielsen/Downloads/Hamilton (Original Broadway Cast Recording) - Act I Booklet - FINAL.pdf', 'rb')
+page_source = booklet.read()
+booklet.close()
+# i can save it in a file
+text_file = open('url_page.txt', 'w')
+text_file.write(page_source)
+text_file.close()
+
+# this lets me use BeautifulSoup because i want to
+soup = BeautifulSoup(page_source, 'lxml')
+print soup
+
+# this uses the find_links function and defines which links i want
+some_urls = find_links(soup)
+useful_url = 'http://genius.com/Lin-manuel-miranda-'
+
+# this makes a list of links and a base name for naming the files i create
+list_of_links = cull_links(useful_url, some_urls)
+base_name = 'hamilton'
+
+# this is a list of names for the files i create
+names_list = file_names(list_of_links, base_name)
+
+# this uses the save_files function to get all the things i want
+save_files(list_of_links, names_list)
diff --git a/parsing_text_files.py b/parsing_text_files.py
new file mode 100644
index 0000000..4a6ce11
--- /dev/null
+++ b/parsing_text_files.py
@@ -0,0 +1,96 @@
+"""this contains the code and functions to parse the lyrics from the text files
+"""
+
+# here are the functions i will use
+
+
+def make_a_soup(filename):
+    """opens the file, reads its contents into a string, and returns that
+    string
+    """
+    current_song = open(filename)
+    important = current_song.read()
+    current_song.close()
+    return important
+
+
+def find_lyrics(soup):
+    """finds the section of the html file that contains the lyrics,
+    turns it into a list, and turns that into one string
+    """
+    # the start and end markers are assumptions about the page markup;
+    # '<hr/>' is 5 characters, which is what the '+ 5' skips past
+    start = soup.find('<hr/>') + 5
+    end = start + soup[start:].find('</div>')
+    new_soup = soup[start:end]
+    # single line-break tags inside a verse become spaces; a double
+    # '<br/>' between speakers collapses to a double space
+    list_lyrics = new_soup.split('<br/>')
+    string_lyrics = ' '.join(list_lyrics)
+    return string_lyrics
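+
+# with the assumed markers above, a sketch of the flow (input invented):
+#   >>> find_lyrics('junk<hr/>BURR: How does a bastard<br/>rise up?</div>rest')
+#   'BURR: How does a bastard rise up?'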
+
+
+def tuples_of_lyrics(string_o_lyrics):
+    """takes in a string of lyrics,
+    splits it by '  ' (every time the character speaking/singing changes),
+    splits each of those into a tuple ('CHARACTER', 'Lyrics'),
+    and returns a list of tuples
+    """
+    # two spaces are what a double '<br/>' collapses to after the join in
+    # find_lyrics, so they mark a change of speaker (assumed markup)
+    list_of_strings_of_lyrics = string_o_lyrics.split('  ')
+    list_of_tuples = []
+    for x in list_of_strings_of_lyrics:
+        if x:
+            # split only on the first ': ' so the lyrics stay in one piece
+            avocado = tuple(x.split(': ', 1))
+            list_of_tuples.append(avocado)
+    return list_of_tuples
+
+
+def names(number_of_songs, base_name):
+    """builds a numbered .txt name for each song"""
+    names_list = []
+    for i in range(number_of_songs):
+        names_list.append(base_name + '_' + str(i) + '.txt')
+    return names_list
+
+
+def filenames(number_of_songs, base_name):
+    """builds a numbered .html name for each song"""
+    names_list = []
+    for i in range(number_of_songs):
+        names_list.append(base_name + '_' + str(i) + '.html')
+    return names_list
+
+
+def assign_names(number_of_songs, base_name_name, base_name_file):
+    """pairs output names with input files and parses each file; the
+    parsed lyrics aren't written anywhere yet"""
+    names_list = names(number_of_songs, base_name_name)
+    filenames_list = filenames(number_of_songs, base_name_file)
+    for i in range(len(names_list)):
+        x = tuples_of_lyrics(find_lyrics(make_a_soup(filenames_list[i])))
+    return names_list
+
+names_list = names(46, 'hamilton_lyrics')
+filenames_list = filenames(46, 'hamilton')
+lyric = []
+for i in range(46):
+    lyric.append(tuples_of_lyrics(find_lyrics(make_a_soup(filenames_list[i]))))
+n = 9
+# print ''
+# print names_list[n]
+# print ''
+print lyric[n]
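+# if the assumed markers match the real pages, lyric[n] is a list like
+# [('BURR', 'How does a bastard, orphan...'), ('HAMILTON', '...')]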
+
+# scratch work: step through find_lyrics by hand on one file
+soup = make_a_soup('hamilton_9.html')
+
+start = soup.find('<hr/>') + 5
+end = start + soup[start:].find('</div>')
+new_soup = soup[start:end]
+list_lyrics = new_soup.split('<br/>')
+string_lyrics = ' '.join(list_lyrics)
+# print start
+# print end
+# print soup[start:end]
+# if lyric[n][2] == None:
+#     print '...'
+
+assign_names(46, 'hamilton_lyrics', 'hamilton')
+
+# print tuples_of_lyrics(find_lyrics(make_a_soup('hamilton_0.html')))
diff --git a/writeup_and_reflection.pdf b/writeup_and_reflection.pdf
new file mode 100644
index 0000000..462ea76
Binary files /dev/null and b/writeup_and_reflection.pdf differ