74 changes: 74 additions & 0 deletions GetWiki.py
@@ -0,0 +1,74 @@
# -*- coding:utf-8 -*-
from pattern.web import *
import os
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

"""
This program takes a language and a Wikipedia article title as input. It then outputs the ten most common words within that
article, first checking that they are significant (e.g., not articles, prepositions, or other stop words).

This program supports English, Swedish, Portuguese, Hungarian, Finnish, Turkish, German, Dutch, Norwegian, Spanish, Russian, Danish, & Italian.

The language dictionary is a global because it's referenced in both get_text() and open_text(). It's a lot easier to reference and edit as a global.
"""

dict_lang = {'English': ['en', 'references'], 'Swedish': ['sv', 'referenser'], 'Portuguese': ['pt', 'referências'],
             'Hungarian': ['hu', 'források'], 'Finnish': ['fi', 'lähteet'], 'Turkish': ['tr', 'kaynakça'],
             'German': ['de', 'anmerkungen'], 'Dutch': ['nl', 'referenties'], 'Norwegian': ['nb', 'referanser'],
             'Spanish': ['es', 'referencias'], 'Russian': ['ru', 'Примечания'.lower()], 'Danish': ['da', 'referencer'],
             'Italian': ['it', 'bibliografia']}

def get_text(language, title):
    """
    Finds the Wikipedia article in the specified language, then writes it into a plaintext .txt file. (Language_us_file.txt)
    """

    filename = language + '_us_file.txt' #Filename of the wiki text file in language "language."

    wiki = Wikipedia(language = dict_lang[language][0]) #Opens wiki in language "language", referencing dict_lang for the appropriate language code

    article = wiki.search(title) #Finds the right article by searching wiki for the title
    article_text = article.plaintext()

    with open(filename, 'w') as article_file: #Creates file "Language_us_file.txt" and writes in the plaintext of the wiki article.
        article_file.write(article_text.encode("UTF-8"))

def open_text(language):
    """
    Opens the appropriate plaintext file and runs a histogram, creating a dictionary with every non-trivial word that appears and its frequency in the article.
    Terminates when it detects that it has read the entire article and reached the bibliography. ("References" section.)
    Outputs a tuple of the word list sorted by decreasing frequency and the histogram dictionary.
    """
    filename = language + '_us_file.txt' #Filename of the wiki text file written by get_text().
    if language.lower() in stopwords.fileids(): #Checks if stopwords supports this language. Stopwords contains lists of common articles/prepositions/filler words in several languages.
        common_words = set(stopwords.words(language.lower()))
    else:
        common_words = set()

    hist = {}

    with open(filename, 'r') as f:
        filetext = [line.translate(None, string.punctuation).lower() for line in f] #Strips punctuation from the file plaintext and makes everything lowercase for processing
    for line in filetext:
        for word in line.split():
            if word == dict_lang[language][1]: #If the end of the wikipedia article (e.g. the "References" section) is reached, the function terminates and returns the sorted histogram.
                sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) #Sorts the list of all words in the article by their frequency. Most frequent = first.
                return (sorted_filetext, hist)
            elif word not in common_words:
                if not word.isdigit(): #Is the "word" actually a word, or a number?
                    if word in hist: #Is the word in the histogram already?
                        hist[word] += 1 #Increase word occurrence count by one
                    else:
                        hist[word] = 1 #First occurrence: word frequency equals 1
    sorted_filetext = sorted(hist, key = hist.__getitem__, reverse = True) #Fallback: no "References" section was found, so sort and return everything counted.
    return (sorted_filetext, hist)

language = 'Hungarian' #The language of choice
title = 'United_States' #The article title of choice

get_text(language, title)
sorted_filetext, hist = open_text(language)

for i in range(10):
    print "{} {}".format(sorted_filetext[i], hist[sorted_filetext[i]]) #Print the top ten words and their frequencies.
37 changes: 37 additions & 0 deletions Reflection.txt
@@ -0,0 +1,37 @@
Project Overview
I used Wikipedia. I ran a simple histogram on the Wikipedia article plaintext, documenting every word and its frequency. I also eliminated common words from the high-frequency list.

Implementation
I implemented my function in two main parts: getting the data, and analysing it. The first half of my code pulls the wiki article of the appropriate language from the internet, and writes the plaintext into a locally stored file. The second half makes a dictionary of every word and its frequency, then returns a list sorted by decreasing frequency.
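
For illustration, the second half (the histogram and the sort) can be sketched with Python's collections.Counter; this is a simplified sketch with a hypothetical helper name, not the code I actually submitted:

from collections import Counter
import string

def word_frequencies(text, common_words):
    # Hypothetical sketch: count every word that is not a stop word or a number.
    hist = Counter()
    for raw in text.lower().split():
        word = raw.strip(string.punctuation)   # drop leading/trailing punctuation
        if word and word not in common_words and not word.isdigit():
            hist[word] += 1
    return hist.most_common(10)                # (word, count) pairs, most frequent first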

I used NLTK to get a list of common words in various languages, rather than writing out lists of articles and prepositions by hand, because I was not personally familiar with several of the languages involved, and making these lists would have been error-prone and highly time-consuming. Though NLTK only supports select languages, it is still vastly easier to use NLTK than to build the lists manually.
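
For reference, this is roughly how the NLTK stop-word lists get pulled in; the corpus has to be downloaded once, and checking fileids() is just one way to guard against unsupported languages (a sketch, not necessarily identical to my submitted code):

import nltk
nltk.download('stopwords')                      # one-time download of the stop-word corpus
from nltk.corpus import stopwords

language = 'Hungarian'
if language.lower() in stopwords.fileids():     # NLTK names its lists in lowercase ('english', 'hungarian', ...)
    common_words = set(stopwords.words(language.lower()))
else:
    common_words = set()                        # no list available, so nothing gets filtered out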

I also decided to make dict_lang (my dictionary of supported languages, language codes, and translated "reference" strings) a global variable. Since it's pretty bulky text-wise, and since it's used by both of my functions, it was cleanest and most efficient to make dict_lang a global variable.

Results
It is interesting to note that in the English, Portuguese, and Spanish wiki pages for the US, the word 'war' appears in the top ten most frequent words. In the Spanish article, 'war' appears 57 times; in Portuguese, 56 times. Other common words include 'world' (Portuguese), 'million' (Danish, Spanish), 'most/more than' (Spanish, Portuguese), 'change' (Turkish), and 'large' (Swedish). As expected, 'US,' 'United,' and 'States' are also very common.
It seems that, despite being a relatively new country, America is already perceived as "larger than life" and as a major military/world force.

Other articles:
Donald Trump -
    US: million, New York
    Italian: Obama, no
    Spanish: consulted
    Portuguese: reliable, no, favor, independent
Hillary Clinton -
    US: campaign, first lady
    Italian: Bill, first lady
    Spanish: first lady, plus, bell
    Portuguese: Obama, first lady
Russia -
    US: Soviet, war, largest, world
    Russian: cinema, consignment
    German: asteroid, states
    Turkish: Soviet, Belarus, big, change

Reflection
My project was appropriately scoped. Unfortunately, I was extremely busy the past week, and I couldn't really devote enough time to this project.

I did manage to get my entire project done in under 7 hours, which is actually pretty impressive in hindsight. I got a lot faster when I stopped trying to use BeautifulSoup.
I learned a lot about HTML when I was mucking about with BeautifulSoup, though, which is good. Going forward, I know a lot more about how web pages are formatted, and how to pull information from them.
If I could have known one thing before I started, it would have been how busy I was going to be this weekend, so that I could have spent my time coding instead of researching BeautifulSoup.
81 changes: 81 additions & 0 deletions getnews.py
@@ -0,0 +1,81 @@
# import urllib
# from bs4 import BeautifulSoup

from bs4 import BeautifulSoup, SoupStrainer
import urllib2
import re

def main():
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    mainurl = 'http://www.cnn.com/'
    # url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'
    # url = 'http://www.cnn.com/2016/02/24/middleeast/swedish-teen-freed-from-isis/index.html'
    soup = BeautifulSoup(opener.open(mainurl))

    # print type(soup.find("div", {"class":"share-bar-whatsapp-container"}))
    #1) Link to the website

    #2) title of article
    # title = soup.findAll("span", {"class":"cd__headline-text"})

    #3) Text of the article
    # paragraphs = soup.findAll("p", {"class":"zn-body__paragraph"})
    # text = " ".join([ paragraph.text.encode('utf-8') for paragraph in paragraphs])

    # print url
    # print title
    # print text

def getlinks():
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    mainurl = 'http://cnnespanol.cnn.com'
    # mainurl = 'http://cnn.com'
    soup = BeautifulSoup(opener.open(mainurl))

    urls = soup.findAll("a", {"href":re.compile("/index.html")})
    text = " ".join([ url.text.encode('utf-8') for url in urls])
    text_file = open('CNNtext.txt', 'a')
    text_file.write(text)
    text_file.close()
    return text

# def process_file(filename):
# hist = dict()
# fp = open(filename)
# for line in fp:
# process_line(line, hist)
# return hist

# def process_line(line, hist):
# line = line.replace('-', ' ')

# for word in line.split():
# word = word.strip(string.punctuation + string.whitespace)
# word = word.lower()

# hist[word] = hist.get(word, 0) + 1


# def most_common(hist):
# t = []
# for key, value in hist.items():
# t.append((value, key))

# t.sort(reverse=True)
# return t

# hist = process_file()

# t = most_common(hist)
# common_words = ['the']
# print 'The most common words are:'
# for freq, word in t[0:10]:
# if word not in common_words:
# print word, '\t\t', freq


if __name__ == '__main__':
    getlinks()
    # main()