74 changes: 74 additions & 0 deletions GetWiki.py
@@ -0,0 +1,74 @@
# -*- coding:utf-8 -*-
from pattern.web import *
import os
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

"""
This program takes a language and a Wikipedia article title as input. It then outputs the ten most common words within that
article, first checking that they are significant (e.g., not articles, prepositions, or other stop words).

This program supports English, Swedish, Portuguese, Hungarian, Finnish, Turkish, German, Dutch, Norwegian, Spanish, Russian, Danish, & Italian.

The language dictionary is a global because it's referenced in both get_text() and open_text(). It's a lot easier to reference and edit as a global.
"""

dict_lang = {'English': ['en', 'references'], 'Swedish': ['sv', 'referenser'], 'Portuguese': ['pt', 'referências'],
             'Hungarian': ['hu', 'források'], 'Finnish': ['fi', 'lähteet'], 'Turkish': ['tr', 'kaynakça'],
             'German': ['de', 'anmerkungen'], 'Dutch': ['nl', 'referenties'], 'Norwegian': ['nb', 'referanser'],
             'Spanish': ['es', 'referencias'], 'Russian': ['ru', 'Примечания'.lower()], 'Danish': ['da', 'referencer'],
             'Italian': ['it', 'bibliografia']}

def get_text(language, title):
    """
    Finds the Wikipedia article in the specified language, then writes it into a plaintext .txt file. (Language_us_file.txt)
    """

    filename = language + '_us_file.txt' #Filename of the wiki text file in language "language."

    wiki = Wikipedia(language = dict_lang[language][0]) #Opens wiki in language "language", referencing dict_lang for the appropriate language code

    article = wiki.search(title) #Finds the right article by searching wiki for the title
    article_text = article.plaintext()

    with open(filename, 'w') as article_file: #Creates file "Language_us_file.txt" and writes in the plaintext of the wiki article.
        article_file.write(article_text.encode("UTF-8"))

def open_text(language):
    """
    Opens the appropriate plaintext file and runs a histogram, creating a dictionary with every non-trivial word that appears and its frequency in the article.
    Terminates when it detects that it has read the entire article and reached the bibliography. ("References" section.)
    Outputs a tuple of the word list sorted by decreasing frequency and the histogram dictionary.
    """
    filename = language + '_us_file.txt' #Filename of the wiki text file written by get_text().
    if language.lower() in stopwords.fileids(): #Checks if stopwords supports this language. Stopwords contains lists of common articles/prepositions/filler words in several languages.
        common_words = set(stopwords.words(language.lower()))
    else:
        common_words = set()

    hist = {}

    with open(filename, 'r') as f:
        filetext = [line.translate(None, string.punctuation).lower() for line in f] #Strips punctuation from the file plaintext and makes everything lowercase for processing
    for line in filetext:
        for word in line.split():
            if word == dict_lang[language][1]: #If the end of the wikipedia article (e.g. the "References" section) is reached, the function terminates and returns the sorted histogram.
                sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) #Sorts the list of all words in the article by their frequency. Most frequent = first.
                return (sorted_filetext, hist)
            elif word not in common_words:
                if not word.isdigit(): #Is the "word" actually a word, or a number?
                    if word in hist: #Is the word in the histogram already?
                        hist[word] += 1 #Increase word occurrence count by one
                    else:
                        hist[word] = 1 #First occurrence: word frequency equals 1
    sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) #Fallback: no "References" section was found, so sort and return everything counted.
    return (sorted_filetext, hist)

language = 'Hungarian' #The language of choice
title = 'United_States' #The article title of choice

get_text(language, title)
sorted_filetext, hist = open_text(language)

for i in range(10):
    print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]]) #Print the top ten words and their frequencies.
37 changes: 37 additions & 0 deletions Reflection.txt
@@ -0,0 +1,37 @@
Project Overview
I used Wikipedia. I ran a simple histogram on the Wikipedia article plaintext, documenting every word and its frequency. I also eliminated common words from the high-frequency list.

Implementation
I implemented my function in two main parts: getting the data, and analysing it. The first half of my code pulls the wiki article of the appropriate language from the internet, and writes the plaintext into a locally stored file. The second half makes a dictionary of every word and its frequency, then returns a list sorted by decreasing frequency.
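
For illustration, the second half (the histogram and the sort) can be sketched with Python's collections.Counter; this is a simplified sketch with a hypothetical helper name, not the code I actually submitted:

from collections import Counter
import string

def word_frequencies(text, common_words):
    # Hypothetical sketch: count every word that is not a stop word or a number.
    hist = Counter()
    for raw in text.lower().split():
        word = raw.strip(string.punctuation)   # drop leading/trailing punctuation
        if word and word not in common_words and not word.isdigit():
            hist[word] += 1
    return hist.most_common(10)                # (word, count) pairs, most frequent first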

I used NLTK to get a list of common words in various languages, rather than writing out lists of articles and prepositions by hand, because I was not personally familiar with several of the languages involved, and making these lists would have been error-prone and highly time-consuming. Though NLTK only supports select languages, it is still vastly easier to use NLTK than to build the lists manually.
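
For reference, this is roughly how the NLTK stop-word lists get pulled in; the corpus has to be downloaded once, and checking fileids() is just one way to guard against unsupported languages (a sketch, not necessarily identical to my submitted code):

import nltk
nltk.download('stopwords')                      # one-time download of the stop-word corpus
from nltk.corpus import stopwords

language = 'Hungarian'
if language.lower() in stopwords.fileids():     # NLTK names its lists in lowercase ('english', 'hungarian', ...)
    common_words = set(stopwords.words(language.lower()))
else:
    common_words = set()                        # no list available, so nothing gets filtered out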

I also decided to make dict_lang (my dictionary of supported languages, language codes, and translated "reference" strings) a global variable. Since it's pretty bulky text-wise, and since it's used by both of my functions, it was cleanest and most efficient to make dict_lang a global variable.

Results
It is interesting to note that in the English, Portuguese, and Spanish wiki pages for the US, the word 'war' appears in the top ten most frequent words. In the Spanish article, 'war' appears 57 times; in Portuguese, 56 times. Other common words include 'world' (Portuguese), 'million' (Danish, Spanish), 'most/more than' (Spanish, Portuguese), 'change' (Turkish), and 'large' (Swedish). As expected, 'US,' 'United,' and 'States' are also very common.
It seems that, despite being a relatively new country, America is already perceived as "larger than life" and as a major military/world force.

Other articles:
Donald Trump -
    US: million, New York
    Italian: Obama, no
    Spanish: consulted
    Portuguese: reliable, no, favor, independent
Hillary Clinton -
    US: campaign, first lady
    Italian: Bill, first lady
    Spanish: first lady, plus, bell
    Portuguese: Obama, first lady
Russia -
    US: Soviet, war, largest, world
    Russian: cinema, consignment
    German: asteroid, states
    Turkish: Soviet, Belarus, big, change

Reflection
My project was appropriately scoped. Unfortunately, I was extremely busy the past week, and I couldn't really devote enough time to this project.

I did manage to get my entire project done in under 7 hours, which is actually pretty impressive in hindsight. I got a lot faster when I stopped trying to use BeautifulSoup.
I learned a lot about HTML when I was mucking about with BeautifulSoup, though, which is good. Going forward, I know a lot more about how web pages are formatted, and how to pull information from them.
If I could have known one thing before I started, it would have been how busy I was going to be this weekend, so that I could have spent my time coding instead of researching BeautifulSoup.
81 changes: 81 additions & 0 deletions getnews.py
@@ -0,0 +1,81 @@
# import urllib
# from bs4 import BeautifulSoup

from bs4 import BeautifulSoup, SoupStrainer
import urllib2
import re

def main():
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    mainurl = 'http://www.cnn.com/'
    # url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'
    # url = 'http://www.cnn.com/2016/02/24/middleeast/swedish-teen-freed-from-isis/index.html'
    soup = BeautifulSoup(opener.open(mainurl))

    # print type(soup.find("div", {"class":"share-bar-whatsapp-container"}))
    #1) Link to the website

    #2) title of article
    # title = soup.findAll("span", {"class":"cd__headline-text"})

    #3) Text of the article
    # paragraphs = soup.findAll("p", {"class":"zn-body__paragraph"})
    # text = " ".join([ paragraph.text.encode('utf-8') for paragraph in paragraphs])

    # print url
    # print title
    # print text

def getlinks():
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    mainurl = 'http://cnnespanol.cnn.com'
    # mainurl = 'http://cnn.com'
    soup = BeautifulSoup(opener.open(mainurl))

    urls = soup.findAll("a", {"href":re.compile("/index.html")})
    text = " ".join([ url.text.encode('utf-8') for url in urls])
    text_file = open('CNNtext.txt', 'a')
    text_file.write(text)
    text_file.close()
    return text

# def process_file(filename):
# hist = dict()
# fp = open(filename)
# for line in fp:
# process_line(line, hist)
# return hist

# def process_line(line, hist):
# line = line.replace('-', ' ')

# for word in line.split():
# word = word.strip(string.punctuation + string.whitespace)
# word = word.lower()

# hist[word] = hist.get(word, 0) + 1


# def most_common(hist):
# t = []
# for key, value in hist.items():
# t.append((value, key))

# t.sort(reverse=True)
# return t

# hist = process_file()

# t = most_common(hist)
# common_words = ['the']
# print 'The most common words are:'
# for freq, word in t[0:10]:
# if word not in common_words:
# print word, '\t\t', freq


if __name__ == '__main__':
    getlinks()
    # main()