diff --git a/.gitignore b/.gitignore index 894a44c..fa532a7 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ venv.bak/ # mypy .mypy_cache/ + +# PyCharm +.idea \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b4e4cc3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Elia Onishchouk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..540b720 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..3882315 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# Introduction to Python. Hometask + +RSS reader is a command-line utility which receives an [RSS](https://en.wikipedia.org/wiki/RSS) URL and prints the results in a human-readable format. 
+ + +Utility provides the following interface: +```shell +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] + [--date DATE] [--to-html TO_HTML] [--to-fb2 TO_FB2] + [--colorize] + source + +Pure Python command-line RSS reader. + +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Prints version info + --json Prints result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limits news topics if this parameter provided + --date DATE Shows news of specific date + --to-html TO_HTML Converts news into html format and save to a specified + path + --to-fb2 TO_FB2 Converts news into fb2 format and save to a specified + path + --colorize Colorizes the cmd output +``` + +With the argument `--json` the program converts the news into [JSON](https://en.wikipedia.org/wiki/JSON) format. + +With the argument `--limit` the program prints only the given number of news items. + +With the argument `--verbose` the program prints all logs to stdout. + +With the argument `--version` the program prints its current version to stdout and exits. + +With the argument `--date` the program prints or saves the cached news of the source from the specified date, if there are any. + +With the argument `--to-html` the program saves news from the source to the given path as an HTML file. + +With the argument `--to-fb2` the program saves news from the source to the given path as an fb2 file. + +With the argument `--colorize` the program colorizes the output in cmd. + +# Caching + +This program stores data in `"home directory"/rss_reader_cache`. Downloaded images are stored in the `image` folder of this directory, next to the `cache.json` file. All data is cached regardless of the `--date` argument. 
\ No newline at end of file diff --git a/json_schema.json b/json_schema.json new file mode 100644 index 0000000..78aba63 --- /dev/null +++ b/json_schema.json @@ -0,0 +1,91 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "title": "News feed json schema", + "required": [ + "Feed", + "Items" + ], + "properties": { + "Feed": { + "type": "string", + "title": "Feed", + "description": "The title of the feed" + }, + "Items": { + "type": "array", + "title": "News", + "items": { + "type": "object", + "title": "news", + "required": [ + "title", + "description", + "link", + "pubDate", + "source" + ], + "properties": { + "title": { + "type": "string", + "title": "Title", + "description": "The title of the news" + }, + "description": { + "type": "string", + "title": "Description", + "description": "The description of the news" + }, + "link": { + "type": "string", + "title": "Link", + "description": "The origin link of the news" + }, + "pubDate": { + "type": "string", + "title": "Date", + "description": "The date this news was published" + }, + "source": { + "type": "object", + "title": "Links inside the description", + "required": [ + "images_links", + "href_links", + "video_links" + ], + "properties": { + "images_links": { + "type": "array", + "title": "Images links", + "items": { + "type": "string", + "title": "Image link", + "description": "The source of the image" + } + }, + "href_links": { + "type": "array", + "title": "Hyper references", + "items": { + "type": "string", + "title": "URL link", + "description": "The source of the hyper reference" + } + }, + "video_links": { + "type": "array", + "title": "Video links", + "items": { + "type": "string", + "title": "Video link", + "description": "The source of the video" + } + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a6104e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +feedparser +bs4 
+colorama \ No newline at end of file diff --git a/rss_reader/__init__.py b/rss_reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rss_reader/rss_reader.py b/rss_reader/rss_reader.py new file mode 100644 index 0000000..084a89f --- /dev/null +++ b/rss_reader/rss_reader.py @@ -0,0 +1,675 @@ +import argparse +import feedparser +import logging +import html +import json +import urllib +import colorama +from base64 import b64encode +from pathlib import Path +from bs4 import BeautifulSoup +from rss_reader import version as vers + + +class Converter: + """This class is used to convert news feed to either html or fb2 version""" + + def __init__(self, news_feed): + """ + Constructor of Converter class. It assigns encoding value additionally + :param NewsFeed news_feed: A NewsFeed object that contains news feed + """ + + self.news_feed = news_feed + self.encoding = self.news_feed.items[0].encoding + + def convert_to_html(self, path, limit, date): + """ + This function creates a html file with news feed + + :param str path: The path where new file will be saved + :param int limit: The number of news to be saved + :param str date: Optional: if exists than resulting html file will only contain news from specific date + """ + + logging.info('Converting to html') + path_object = Path(path) + path_object.mkdir(parents=True, exist_ok=True) + path_object /= 'news feed.html' + with path_object.open('w', encoding="utf-8") as html_file: + html_file.write(self.create_html(limit, date)) + + logging.info('Converting to html successful') + + def convert_to_fb2(self, path, limit): + """ + This function creates a fb2 file with news feed + + :param str path: The path where new file will be saved + :param int limit: The number of news to be saved + """ + + logging.info('Converting to fb2') + path_object = Path(path) + path_object.mkdir(parents=True, exist_ok=True) + path_object /= 'news feed.fb2' + with path_object.open('w', encoding=self.encoding) as fb2_file: + 
fb2_file.write(self.create_fb2(limit)) + + logging.info('Converting to fb2 successful') + + def create_html(self, limit, date): + """ + This function creates a html version of news feed + + :param int limit: The number of news to be created + :param str date: Optional: if exists than resulting html implementation will contain news from specific date + :return: a html like news feed + :rtype: str + """ + + logging.info('Creating html text') + limit = checking_limit(limit, self.news_feed.items) + news = '\n'.join([item.create_div(date) for item in self.news_feed.items[:limit]]) # ???? + + return """ + + + +

News Feed

+ {0} + + + """.format(news) + + def create_fb2(self, limit): + """ + This function creates a fb2 version of news feed + + :param int limit: The number of news to be created + :return: a fb2 like news feed + :rtype: str + """ + + logging.info('Creating fb2 text') + limit = checking_limit(limit, self.news_feed.items) + news = ''.join([item.create_section() for item in self.news_feed.items[:limit]]) + binaries = ''.join([item.create_binary() for item in self.news_feed.items[:limit]]) + + return """ + + + + newspapers + RSS Reader + el0ny + + + {myprog} + {vers} + + + + {news} + + {binaries} + + """.format(myprog=__name__, vers=vers.__version__, + news=news, encoding=self.encoding, + binaries=binaries) + + +class NewsFeed: + """Base class for news feed""" + + def __init__(self, feed_title, items): + """ + This constructor only initializes two values, nothing else + + :param str feed_title: The title of news feed + :param list items: A list of Item objects, basically, a list of news + """ + + self.feed_title = feed_title + self.items = items + + def print_feed(self, _json, limit, colorize): + """ + This function allows to print news in cmd either in json or str format + + :param colorize: If true than colorize the output in cmd + :param bool _json: If true than the news will be in json format, otherwise in str format + :param int limit: The number of news to be printed + """ + + limit = checking_limit(limit, self.items) + if _json: + self.print_to_json(limit) + else: + self.print_to_console(limit, colorize) + + def create_json(self, is_cached, limit): + """ + This function allows to create json like dict of news + + :param bool is_cached: If true then json will be ready to be saved, otherwise, to be printed + :param int limit: The number of news to be printed + :return: A json like dict of news + :rtype: dict + """ + + return {'Feed': self.feed_title, 'Items': [item.return_item(is_cached) for item in self.items[:limit]]} + + def print_to_json(self, limit): + """ + This 
function allows to print news in cmd in json format + + :param int limit: The number of news to be printed + """ + + logging.info('Printing news in json format') + print(json.dumps(self.create_json(0, limit))) + + def print_to_console(self, limit, colorize): + """ + This function allows to print news in cmd in str format + + :param colorize: If true than colorize the output in cmd + :param int limit: The number of news to be printed + """ + + logging.info('Printing news in console format') + print('Feed: {0}'.format(self.feed_title)) + for item in self.items[:limit]: + item.print_to_console(colorize) + logging.info('Printed %s news', limit) + + def save_news(self, limit): + """ + This function allows to save news in a json file in homedirectory/rss_reader_cache/cache.json + + :param int limit: The number of news to be saved + """ + logging.info('Saving news') + news_to_save = self.create_json(1, limit)['Items'] + existing_news = load_from_cache() + news_to_save += [item for item in existing_news if item not in news_to_save] + path = Path.home().joinpath('rss_reader_cache') + cache_file = "cache.json" + path.mkdir(parents=True, exist_ok=True) + filepath = path / cache_file + with filepath.open('w') as json_file: + json.dump(news_to_save, json_file) + logging.info('Saving news successful') + + +class Item: + """ + Class for single news item from news feed + Attributes of the class can vary depend on if this item is created from loading from cache, or from parsed feed + They are: + str title News title + str pubDate Published date in it's original form + str link Link to the news + str description Description of the news + dict links A dict with href, image, video links + str date_string (optional: only from cache) Published date in YYYYMMDD format + str source (optional: only from cache) Rss source + str encoding (optional: only from cache) Encoding of the news + """ + def __init__(self, news_dict): + for key in news_dict: + setattr(self, key, news_dict[key]) + + def 
print_to_console(self, colorize): + """ + This function allows to print one news item in console + + :param colorize: If true than colorize the output in cmd + """ + title_color = '' + date_color = '' + link_color = '' + description_color = '' + href_color = '' + image_color = '' + video_color = '' + divider_color = '' + if colorize: + colorama.init(autoreset=True) + title_color = colorama.Fore.MAGENTA + date_color = colorama.Fore.WHITE + link_color = colorama.Fore.LIGHTBLACK_EX + description_color = colorama.Fore.LIGHTYELLOW_EX + colorama.Back.BLACK + href_color = colorama.Fore.GREEN + image_color = colorama.Fore.LIGHTGREEN_EX + video_color = colorama.Fore.CYAN + divider_color = colorama.Fore.LIGHTWHITE_EX + colorama.Back.LIGHTWHITE_EX + print(colorama.Fore.GREEN) + print(title_color + '\nTitle: {0}'.format(self.title)) + print(date_color + 'Date: {0}'.format(self.pubDate)) + print(link_color + 'Link: {0} \n'.format(self.link)) + print(description_color + self.description) + print() + + if self.links['href_links']: + print(href_color + '\nLinks:') + for link in self.links['href_links']: + print(href_color + link) + + if self.links['images_links']: + print(image_color + '\nImages:') + for link in self.links['images_links']: + print(image_color + link) + + if self.links['video_links']: + print(video_color + '\nVideos:') + for link in self.links['video_links']: + print(video_color + link) + + print(divider_color + '\n//////////////////////////////////////////////////////////////////////////') + + def create_div(self, date): + """ + This function creates a div block of news needed for html convertation + + :param str date: Optional: if exists than resulting div implementation will only contain news from specific date + :return: A string representation of div block of news + :rtype: str + """ + + return """ +
+

{title}

+ {pubDate} +

+

{description}

+ Read More +


+
+ """.format(title=html.escape(self.title), pubDate=self.pubDate, + description=self.insert_hrefs(self.description, date), link=self.link) + + def create_section(self): + """ + This function creates a section block of news needed for fb2 convertation + + :return: A string representation of section block of news + :rtype: str + """ + + logging.info('Creating section') + description = html.escape(self.description) + return """ +
+ <p>{title}</p> +

{pubDate}

+

{description}

+
+ """.format(title=html.escape(self.title), pubDate=self.pubDate, + description=self.insert_hrefs_fb2(self.description)) + + def insert_hrefs(self, description, date): + """ + This function inserts href links in description needed for html convertation + + :param str description: The original description of news + :param str date: Optional: if exists than resulting description will only contain news from specific date + :return: A description with inserted href links + :rtype: str + """ + + description = self.insert_images(html.escape(description), date) + description = self.insert_videos(description) + for href_link in self.links['href_links']: + href_raw = description[description.find(' [link '):description.find(']', description.find(' [link '))+1] + href_content = href_raw[href_raw.find(' | ')+3:len(href_raw)-1] + href_html = '{content}'.format(href=href_link[href_link.find(': ')+2:], + content=href_content) + description = description.replace(href_raw, href_html) + logging.info('href inserted') + return description + + def insert_images(self, description, date): + """ + This function inserts images in description needed for html convertation + + :param str description: The original description of news + :param str date: Optional: if exists than resulting description will only contain news from specific date + :return: A description with inserted image links + :rtype: str + """ + logging.info('Image inserted') + for image_link in self.links['images_links']: + image_raw = description[description.find(' [image '):description.find(']', description.find(' [image '))+1] + image_alt = image_raw[image_raw.find(' | ') + 3:len(image_raw) - 1] + source = image_link[image_link.find(': ') + 2:] + if date: + image_name = source.split('/')[-1] + image_name = image_name.translate(str.maketrans('', '', '.?><"*:|')) + '.jpg' + path = Path.home().joinpath('rss_reader_cache/image') + source = path / image_name + image_html = '{alt}'.format(src=source, alt=image_alt) + description 
= description.replace(image_raw, image_html) + return description + + def insert_videos(self, description): + """ + This function inserts video links in description needed for html convertation + (I thought that I can convert them into full videos, but then I realised that it was a bad idea, + so I decided to just keep that part, although it isn't necessary anymore + + :param str description: The original description of news + :param str date: Optional: if exists than resulting description will only contain news from specific date + :return: A description with inserted video links + :rtype: str + """ + logging.info('Video inserted') + for video_link in self.links['video_links']: + video_href = description[description.find(' [video '):description.find(']', description.find(' [video '))+1] + logging.info(video_href) + source = video_link[video_link.find(': ') + 2:] + image_html = '{content}'.format(src=source, content=video_href[1:]) + description = description.replace(video_href, image_html) + return description + + def create_binary(self): + """ + This function creates a with b64 images needed for fb2 convertation + + :return: A string in format with images inside + """ + + logging.info('Creating binaries') + binaries = '' + if not self.links['images_links']: + return '' + for image_link in self.links['images_links']: + source = image_link[image_link.find(': ') + 2:] + image_name = source.split('/')[-1] + if source == '': + image_name = '.jpg' + encoded_string = '' + else: + logging.info('Image name %s', image_name) + image_name = image_name.translate(str.maketrans('', '', '.?><"*:|')) + '.jpg' + path = Path.home().joinpath('rss_reader_cache/image') + source = path / image_name + with open(source, "rb") as image_file: + encoded_string = b64encode(image_file.read()).decode() + + binaries += '{data}'\ + .format(src=image_name, data=encoded_string) + return binaries + + def insert_hrefs_fb2(self, description): + """ + This function allows find and insert links into 
description + (That is also a rudimental function. Originally I wanted to make hrefs to web links, which are stored as + notes. But something went wrong and not all rss were working correctly. So now it just makes links that are + empty) + + :param str description: Were to find those hrefs + :return: Resulting description with inserted href links + :rtype: str + """ + + logging.info('href inserted') + description = self.insert_images_fb2(html.escape(description)) + for href_link in self.links['href_links']: + href_raw = description[description.find(' [link '):description.find(']', description.find(' [link '))+1] + href_content = href_raw[href_raw.find(' | ')+3:len(href_raw)-1] + href_fb2 = '{content}'.format(href=href_link[href_link.find(': ')+2:], + content=href_content) + description = description.replace(href_raw, href_fb2) + return description + + def insert_images_fb2(self, description): + """ + This function allows find and insert links to images into description + + :param str description: Were to find those images + :return: Resulting description with inserted image links + :rtype: str + """ + logging.info('Image inserted') + for image_link in self.links['images_links']: + image_raw = description[description.find(' [image '):description.find(']', description.find(' [image '))+1] + image_alt = image_raw[image_raw.find(' | ') + 3:len(image_raw) - 1] + source = image_link[image_link.find(': ') + 2:] + image_name = source.split('/')[-1] + image_name = image_name.translate(str.maketrans('', '', '.?><"*:|')) + '.jpg' + image_html = '{alt}'.format(src=image_name, alt=image_alt) + description = description.replace(image_raw, image_html) + return description + + def return_item(self, is_cached): + """ + This function returns the content of this object as a dict + + :param bool is_cached: If true than the result dict will be able to be cached + :return: A dict with this object's content + :rtype: dict + """ + + item_content = {'title': self.title, 'description': 
self.description, + 'link': self.link, 'pubDate': self.pubDate, 'links': self.links} + if is_cached: + item_content['date_string'] = self.date_string + item_content['source'] = self.source + item_content['encoding'] = self.encoding + return item_content + + +def set_argparse(): + """ + This function allows to get needed parameters from command line + + :return: An object with all needed parameters inside + """ + + parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader.') + parser.add_argument('source', type=str, help='RSS URL') + + parser.add_argument('--version', action='version', version='%(prog)s v'+vers.__version__, + help='Prints version info') + parser.add_argument('--json', action='store_true', help='Prints result as JSON in stdout') + parser.add_argument('--verbose', action='store_true', help='Outputs verbose status messages') + parser.add_argument('--limit', type=int, default=-1, help='Limits news topics if this parameter provided') + parser.add_argument('--date', type=str, help='Shows news of specific date') + parser.add_argument('--to-html', dest='to_html', type=str, + help='Converts news into html format and save to a specified path') + parser.add_argument('--to-fb2', dest='to_fb2', type=str, + help='Converts news into fb2 format and save to a specified path') + parser.add_argument('--colorize', action='store_true', help='Colorizes the cmd output') + return parser.parse_args() + + +def find_images(soup): + """ + This function allows to extract image links from parsed feed + + :param bs4.BeautifulSoup soup: A beautifulsoup representation of parsed news feed + :return: A list of found image links + :rtype: list + """ + + logging.info('Starting image finding') + image_iterator = 0 + images_links = [] + + for img in soup.findAll('img'): + + image_iterator += 1 + if 'alt' in img.attrs and img['alt'] != '': + replaced_data = ' [image {0} | {1}] '.format(image_iterator, img['alt']) + else: + replaced_data = ' [image 
{0}]'.format(image_iterator) + src = img['src'] + + if src != '': + image_name = src.split('/')[-1] + image_name = image_name.translate(str.maketrans('', '', '.?><"*:|')) + '.jpg' + path = Path.home().joinpath('rss_reader_cache/image') + path.mkdir(parents=True, exist_ok=True) + filepath = path / image_name + if filepath.is_file(): + logging.info('Image already exists') + else: + urllib.request.urlretrieve(src, filepath) + images_links.append('[{0}]: {1}'.format(image_iterator, src)) + soup.find('img').replace_with(replaced_data) + + logging.info('Image finding finished. Found %s images', image_iterator) + return images_links + + +def find_href(soup): + """ + This function allows to extract href links from parsed feed + + :param bs4.BeautifulSoup soup: A beautifulsoup representation of parsed news feed + :return: A list of found href links + :rtype: list + """ + + logging.info('Starting link finding') + href_iterator = 0 + href_links = [] + for href in soup.findAll('a'): + if 'href' in href.attrs: + href_iterator += 1 + link = href['href'] + if href.text != '': + replaced_data = ' [link {0} | {1}] '.format(href_iterator, href.text) + else: + replaced_data = ' [link {0}] '.format(href_iterator) + href_links.append('[{0}]: {1}'.format(href_iterator, link)) + href.replace_with(replaced_data) + logging.info('Link finding finished. Found %s links', href_iterator) + return href_links + + +def find_videos(soup): + """ + This function allows to extract