class Database():
    """SQLite-backed cache of feed names and news articles.

    Data lives in ``cache.db`` in the current working directory, in two
    tables: ``feed(source UNIQUE, name)`` and
    ``news(source, date, title, link UNIQUE, description, links)``.
    A connection is opened for each operation and closed when it finishes.
    """

    _DB_FILE = "cache.db"

    def __init__(self):
        super(Database, self).__init__()
        self.conn = None
        self.cursor = None
        # IF NOT EXISTS also repairs a database file that exists but is
        # empty (e.g. left behind by an interrupted first run).
        conn = sqlite3.connect(self._DB_FILE)
        try:
            conn.execute(
                "CREATE TABLE IF NOT EXISTS feed (source text UNIQUE, name text)")
            conn.execute(
                "CREATE TABLE IF NOT EXISTS news (source text, date text, "
                "title text, link text UNIQUE, description text, links text)")
            conn.commit()
        finally:
            conn.close()

    def _open(self):
        """Open a connection and a cursor for one operation."""
        self.conn = sqlite3.connect(self._DB_FILE)
        self.cursor = self.conn.cursor()

    def _close(self):
        """Close the current connection, if one was opened."""
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.cursor = None

    def write_data(self, data, feed, url, verbose, color):
        """Store articles and the feed name in the cache.

        Params:
            data: sequence of 6-tuples
                  (source, date, title, link, description, links)
            feed: str - feed title
            url: str - feed source URL
            verbose: bool - draw a progress bar while writing
            color: bool - colorize the error message

        Articles whose link is already cached are skipped individually, so
        the remaining new articles of the batch are still stored.
        """
        total = len(data) + 1  # the extra step is the feed row
        try:
            self._open()
            if verbose:
                write_progressbar(total, 0)
            for counter, news in enumerate(data, start=1):
                # OR IGNORE: a duplicate link must not abort the whole batch.
                self.cursor.execute(
                    "INSERT OR IGNORE INTO news VALUES (?,?,?,?,?,?)", news)
                if verbose:
                    write_progressbar(total, counter)
            self.cursor.execute(
                "INSERT OR IGNORE INTO feed VALUES (?,?)", (url, feed))
            self.conn.commit()
        except sqlite3.DatabaseError:
            stdout_write("Database error", color="red", colorize=color)
        finally:
            self._close()
            if verbose:
                write_progressbar(total, total)

    def read_data(self, url, date, color):
        """Return ``(feed, data)`` cached for *url* on *date*.

        Params:
            url: str - feed source URL
            date: str or int - publication date, YYYYMMDD
            color: bool - colorize the error message

        ``feed`` is a list of 1-tuples with the feed name, ``data`` a list
        of news rows in table column order. Exits the program when the
        database cannot be read.
        """
        feed, data = None, None
        try:
            self._open()
            # Bound parameters: the URL comes from the command line and may
            # contain quotes, so it must never be spliced into the SQL text.
            self.cursor.execute(
                "SELECT name FROM feed WHERE source = ?", (url,))
            feed = self.cursor.fetchall()
            self.cursor.execute(
                "SELECT * FROM news WHERE source = ? AND date = ?",
                (url, str(date)))
            data = self.cursor.fetchall()
        except sqlite3.Error as e:
            stdout_write(f"Database reading error {e}", color="red", colorize=color)
            sys.exit()
        finally:
            self._close()
        return feed, data
def _download_image(url, verbose, sv_path, color=False):
    """Download the image at *url* into directory *sv_path*.

    Returns the local file path, or "" when the download fails (the
    failure is reported on stdout, never raised).
    """
    stdout_write("Downloading image", verbose=verbose, color="blue", colorize=color)
    try:
        local_name, _headers = urllib.request.urlretrieve(
            url, os.path.join(sv_path, url.split('/')[-1]))
    except ValueError:
        # urllib raises ValueError for a malformed / unknown-scheme URL.
        stdout_write("Error: image not found", color="red", colorize=color)
        return ""
    except OSError:
        # URLError and HTTPError are OSError subclasses; this also covers a
        # target file that cannot be written.
        stdout_write("Error occurred during downloading image", color="red", colorize=color)
        return ""
    stdout_write(f'Image "{url}" was downloaded.', verbose=verbose, color="green", colorize=color)
    return local_name


class Converter():
    """Render parsed feed data as JSON text, or save it as an FB2/HTML file.

    ``column`` is a list of article dicts with the keys ``title``, ``link``,
    ``text``, ``links`` and optionally ``date``. Entries of ``links`` carry
    a trailing " (image)" or " (text)" marker.
    """

    @staticmethod
    def _split_links(links):
        """Split marked links into (image_urls, text_urls), markers removed."""
        image_links, text_links = [], []
        for link in links:
            if link.endswith(" (image)"):
                image_links.append(link[:-len(" (image)")])
            elif link.endswith(" (text)"):
                text_links.append(link[:-len(" (text)")])
            elif link:
                # Unmarked but non-empty: keep it as an ordinary link.
                text_links.append(link)
        return image_links, text_links

    @staticmethod
    def _output_path(sv_path, extension):
        """Build the output file name inside *sv_path*."""
        return os.path.join(sv_path, f"{hash(time())}-{randint(0, 100)}.{extension}")

    def to_json(self, feed, column, verbose, color):
        """Return the feed as a JSON document (str).

        Layout: ``{"title": feed, "news": [{"title", "date"?, "link",
        "description", "links"}, ...]}``. Serialization goes through
        ``json.dumps`` so quotes, backslashes and newlines in the article
        text are escaped and the result is always valid JSON.
        """
        import json

        stdout_write("Convert to json...", verbose=verbose, color="blue", colorize=color)
        total = len(column)
        show_progress = verbose and total  # nothing to draw for an empty feed
        if show_progress:
            write_progressbar(total, 0)
        articles = []
        for done, news in enumerate(column, start=1):
            article = {"title": news['title']}
            if 'date' in news:
                article["date"] = news['date']
            article["link"] = news['link']
            article["description"] = news['text']
            article["links"] = list(news['links'])
            articles.append(article)
            if show_progress:
                write_progressbar(total, done)
        return json.dumps({"title": feed, "news": articles}, ensure_ascii=False, indent=2)

    def to_fb2(self, feed, column, url, sv_path=None, verbose=False, color=False):
        """Convert the feed to a FictionBook 2 document and save it as a file.

        Params:
            feed - feed title
            column - list of article dicts
            url - link to the source, used as the author nickname
            sv_path - output directory; the current working directory
                      (resolved at call time) when omitted
            verbose, color - status output switches

        Each article becomes one <section>; downloaded images are embedded
        once each as base64 <binary> elements and referenced by id.
        """
        import mimetypes
        from xml.sax.saxutils import escape

        sv_path = sv_path or os.getcwd()
        stdout_write("Creating FB2 file", verbose=verbose, color="blue", colorize=color)
        binaries = {}  # base64 payload -> (binary id, content type)
        sections = []

        stdout_write("Convert news", verbose=verbose, color="blue", colorize=color)
        for news in column:
            stdout_write("Converting an article...", verbose=verbose, color="blue", colorize=color)
            image_links, text_links = self._split_links(news["links"])
            image_ids = []
            for link in image_links:
                img_path = _download_image(link, verbose, sv_path, color)
                if not img_path:
                    continue
                try:
                    with open(img_path, 'rb') as binfile:
                        payload = b64encode(binfile.read()).decode()
                except OSError:
                    continue  # unreadable image: the article is kept without it
                if payload not in binaries:
                    content_type = mimetypes.guess_type(img_path)[0] or "image/jpeg"
                    binaries[payload] = (f"img{len(binaries)}", content_type)
                image_ids.append(binaries[payload][0])

            lines = ["<section>", f"<title><p>{escape(news['title'])}</p></title>"]
            lines += [f'<image l:href="#{image_id}"/>' for image_id in image_ids]
            # Cached articles may carry no date at all.
            lines.append(f"<p>{escape(str(news.get('date', 'Unknown')))}</p>")
            lines.append(f"<p>{escape(news['text'])}</p>")
            if text_links:
                lines.append("<p>links</p>")
                lines += [f"<p>{escape(link)}</p>" for link in text_links]
            lines.append(f"<p>Source: {escape(feed)}</p>")
            lines.append("</section>")
            sections.append("\n".join(lines))
        stdout_write("Text data converted", verbose=verbose, color="green", colorize=color)

        author = f"<author><nickname>{escape(url)}</nickname></author>"
        document = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" '
            'xmlns:l="http://www.w3.org/1999/xlink">',
            "<description>",
            "<title-info>",
            "<genre>sci_business</genre>",
            author,
            f"<book-title>{escape(feed)}</book-title>",
            "<lang>en</lang>",
            "</title-info>",
            "<document-info>",
            author,
            "<date>11.11.2011</date>",
            "<version>3.14</version>",
            f"<id>{hash(time() + randint(10000000, 1000000000000))}</id>",
            "</document-info>",
            "</description>",
            "<body>",
        ]
        document += sections
        document.append("</body>")
        document += [
            f'<binary id="{binary_id}" content-type="{content_type}">{payload}</binary>'
            for payload, (binary_id, content_type) in binaries.items()]
        document.append("</FictionBook>")
        stdout_write("Add binary part", verbose=verbose, color="green", colorize=color)

        # The XML declaration promises UTF-8, so write exactly that instead
        # of the platform default encoding.
        with open(self._output_path(sv_path, "fb2"), "w", encoding="utf-8") as file:
            file.write("\n".join(document))
        stdout_write("FB2 document created", verbose=verbose, color="green", colorize=color)

    def to_html(self, feed, column, sv_path=None, verbose=False, color=False):
        """Convert the feed to an HTML page and save it as a file.

        Params:
            feed - feed title
            column - list of article dicts
            sv_path - output directory; the current working directory
                      (resolved at call time) when omitted
            verbose, color - status output switches

        Images are downloaded next to the page and referenced by file name.
        """
        from html import escape

        sv_path = sv_path or os.getcwd()
        stdout_write("Creating HTML version", verbose=verbose, color="blue", colorize=color)
        articles = []
        for news in column:
            image_links, text_links = self._split_links(news["links"])
            images = [_download_image(link, verbose, sv_path, color) for link in image_links]
            parts = ["<div>", f"<h3>{escape(news['title'])}</h3>"]
            # The page is saved in the same directory as the images, so the
            # bare file name is the correct relative reference.
            parts += [f'<img src="{escape(os.path.basename(img))}" alt="Not found">'
                      for img in images]
            parts.append(f"<p>{escape(news['text'])}</p>")
            parts += [f'<a href="{escape(link)}">link </a>' for link in text_links]
            parts.append(f"<p>Date: {escape(str(news.get('date', 'Unknown')))}</p>")
            parts.append("</div>")
            articles.append("\n".join(parts))

        document = "\n".join([
            "<!DOCTYPE html>",
            "<html>",
            "<head>",
            '<meta charset="utf-8">',
            f"<title>{escape(feed)}</title>",
            "</head>",
            "<body>",
            "\n".join(articles),
            "</body>",
            "</html>",
            "",
        ])
        with open(self._output_path(sv_path, "html"), "w", encoding="utf-8") as file:
            file.write(document)
        stdout_write("Finish HTML document", verbose=verbose, color="green", colorize=color)
class _HTMLTagsParser(HTMLParser):
    """Collect the plain text and the links of an HTML fragment.

    After feeding, ``text`` holds the text with every <img> replaced by an
    "[Image: alt][n]" placeholder and ``links`` holds the collected URLs,
    each suffixed with " (image)" or " (text)".
    """

    def __init__(self):
        super().__init__()
        self.links = []
        self.text = ""

    def handle_starttag(self, tag, attrs):
        """Convert <img> and <a> tags to their text form."""
        if tag == "img":
            # Number the placeholder after the link this image will occupy.
            num = len(self.links) + 1
            self.text += "[Image"
            for name, value in attrs:
                # Valueless attributes (e.g. <img alt>) arrive as None.
                if not value:
                    continue
                if name == "alt":
                    self.text += f": {value}"
                elif name == "src":
                    self.links.append(value + " (image)")
            self.text += f"][{num}]"
        elif tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value + " (text)")

    def handle_data(self, data):
        """Append text content found between tags."""
        self.text += data


def parse_HTML(text):
    """Return (text without tags, list of marked links) for an HTML string."""
    parser = _HTMLTagsParser()
    parser.feed(text)
    # close() flushes data the parser is still buffering, e.g. trailing
    # text containing a bare "&" that it holds back awaiting more input.
    parser.close()
    return parser.text, parser.links
def stdout_write(string, sep=' ', end='\n', flush=False, verbose=True, color="", colorize=False):
    """Print a single string, optionally colored, unless *verbose* is false.

    Params:
        string: text to print; the HTML entity for an apostrophe is
                replaced by a plain apostrophe first
        sep, end, flush: forwarded to print()
        verbose: nothing is printed when false
        color: "red", "blue" or "green"; anything else means no color
        colorize: ANSI color codes are only emitted when true
    """
    if not verbose:
        return
    codes = {"red": '\033[31m', "blue": '\033[34m', "green": '\033[92m'}
    prefix = codes.get(color, "") if colorize else ""
    suffix = '\033[0m' if prefix else ""
    # NOTE(review): in the reviewed copy the search string was a bare
    # apostrophe (a no-op replace); restored to the entity the original
    # docstring implies - confirm against the upstream file.
    string = string.replace("&#39;", "'")
    print(prefix + string + suffix, sep=sep, end=end, flush=flush)


def write_progressbar(elems, done, length=20):
    """Draw a one-line progress bar on stdout.

    Params:
        elems: total count of elements (0 or None means nothing to do)
        done: progress, in elements
        length: bar width in characters

    The bar overwrites itself via a carriage return and ends with a
    newline once it is complete.
    """
    if done != 0:
        print("\r", end="")
    if not elems or elems < 0:
        # Nothing to count: show a finished bar instead of dividing by zero.
        col, percent, finished = length, 100, True
    else:
        done = min(max(done, 0), elems)  # keep the bar inside its frame
        col = int(length * (done / elems))
        percent = int(100 * done / elems)
        finished = done == elems
    print(f"[{'=' * col + ' ' * (length - col)}] {percent}%", end="")
    if finished:
        print()
class RSSReader():
    """RSSReader: Class for reading rss channels.

    News comes from the feed URL, or from the local cache when a date was
    given.

    Methods:
        show_news() - print news to stdout
        show_json() - print news to stdout in json format
        save_fb2() - save news as fb2 file
        save_html() - save news as html file
    """

    def __init__(self, source, limit, verbose, date, sv_path, colorize):
        super(RSSReader, self).__init__()
        self.__source = source
        self.__limit = limit
        self.__verbose = verbose
        self.__date = date
        self.__sv_path = sv_path
        self.__colorize = colorize
        self.__text = ""

    def __fail(self, message):
        """Print *message* as an error and terminate with a failure status."""
        stdout_write(message, color="red", colorize=self.__colorize)
        sys.exit(1)

    @staticmethod
    def _element_text(parent, tag, default=""):
        """Return the text of the first *tag* element under *parent*.

        Text and CDATA children are concatenated; *default* is returned
        when the element is missing or empty.
        """
        nodes = parent.getElementsByTagName(tag)
        if not nodes:
            return default
        text = "".join(
            child.data for child in nodes[0].childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE))
        return text or default

    def __find_news(self):
        """Ask database for news from entered date
        Return data in the same format with __parse function
        """
        stdout_write("Reading data from database...", verbose=self.__verbose,
                     color="blue", colorize=self.__colorize)
        feed, data = Database().read_data(self.__source, self.__date, self.__colorize)
        if not data:
            self.__fail("Error: Articles from the entered date not found")
        total = len(data)
        if self.__verbose:
            write_progressbar(total, 0)
        column = []
        for counter, news in enumerate(data, start=1):
            column.append({"title": news[2],
                           # The converters read the date too, so cached
                           # articles must carry it like fresh ones do.
                           "date": news[1],
                           "link": news[3],
                           "text": news[4],
                           "links": news[5].split('\n') if news[5] else []})
            if self.__verbose:
                write_progressbar(total, counter)
        # The feed row may be absent even when articles are cached.
        feed_name = feed[0][0] if feed else self.__source
        return feed_name, column

    def __cache_data(self, column, feed):
        """Take parsed data and write it to database"""
        stdout_write("Writing data to database...", verbose=self.__verbose,
                     color="blue", colorize=self.__colorize)
        formated_data = []
        for col in column:
            try:
                day = dateutil.parser.parse(col["date"]).strftime("%Y%m%d")
            except (ValueError, OverflowError):
                # Without a parsable date the article could never be found
                # by date again, so it is not cached.
                continue
            formated_data.append((self.__source, day, col["title"], col["link"],
                                  col["text"], "\n".join(col["links"])))
        Database().write_data(formated_data, feed, self.__source,
                              self.__verbose, self.__colorize)

    def __read_news(self):
        """Read data from link"""
        try:
            stdout_write(f"Reading information from {self.__source}", end='...\n',
                         verbose=self.__verbose, color="blue", colorize=self.__colorize)
            with urllib.request.urlopen(self.__source) as rss:
                # Keep the raw bytes: the XML parser honours the encoding
                # declared by the feed itself.
                self.__text = rss.read()
            stdout_write("Complete.", verbose=self.__verbose, color="green",
                         colorize=self.__colorize)
        except ValueError:
            self.__fail("Error: Can't connect, please try with https://")
        except urllib.error.URLError:
            self.__fail("Error: Can't connect to web-site, please check URL")
        except Exception:
            self.__fail("Unknown error")

    def __parse(self):
        """Parse XML data to python structures"""
        from xml.parsers.expat import ExpatError

        stdout_write("Parsing information...", verbose=self.__verbose,
                     color="blue", colorize=self.__colorize)
        try:
            xml = parseString(self.__text)
        except ExpatError:
            self.__fail("Error: source is not a valid RSS document")
        feed = self._element_text(xml, "title", default=self.__source)
        items = list(xml.getElementsByTagName("item"))
        # A negative limit never matched the counter before, i.e. "no limit".
        if self.__limit is not None and self.__limit >= 0:
            items = items[:self.__limit]
        total = len(items)
        if self.__verbose:
            write_progressbar(total, 0)
        column = []
        for counter, item in enumerate(items, start=1):
            text, links = parse_HTML(self._element_text(item, "description"))
            column.append({"title": self._element_text(item, "title"),
                           "date": self._element_text(item, "pubDate"),
                           "link": self._element_text(item, "link"),
                           "text": text,
                           "links": links})
            if self.__verbose:
                write_progressbar(total, counter)
        self.__cache_data(column, feed)
        return feed, column

    def __read(self):
        """Information source selection"""
        if not self.__date:
            self.__read_news()
            return self.__parse()
        return self.__find_news()

    def show_news(self):
        """Read data and print info in stdout"""
        feed, column = self.__read()
        stdout_write(f"{feed}", end="\n\n")
        for news in column:
            stdout_write(f"Title: {news['title']}")
            if news.get('date'):
                stdout_write(f"Date: {news['date']}")
            stdout_write(f"Link: {news['link']}", end="\n\n")
            stdout_write(news['text'], end="\n\n")
            if news['links']:
                stdout_write("Links:")
                for link_num, link in enumerate(news['links'], start=1):
                    stdout_write(f"[{link_num}]: {link}", color="blue",
                                 colorize=self.__colorize)
            stdout_write("\n\n")

    def show_json(self):
        """Read data, convert into json and print info in stdout"""
        feed, column = self.__read()
        json_text = Converter().to_json(feed, column, self.__verbose, color=self.__colorize)
        stdout_write(json_text)

    def __save_options(self):
        """Keyword arguments shared by the file converters."""
        options = {"verbose": self.__verbose, "color": self.__colorize}
        if self.__sv_path:
            # Only override the converter's default directory when asked to.
            options["sv_path"] = self.__sv_path
        return options

    def save_fb2(self):
        """Read data, convert to fb2 & save it as file"""
        feed, column = self.__read()
        Converter().to_fb2(feed, column, self.__source, **self.__save_options())

    def save_html(self):
        """Read data, convert to html & save it as file"""
        feed, column = self.__read()
        Converter().to_html(feed, column, **self.__save_options())
def parse_arguments(argv=None):
    """Parse command-line options.

    Params:
        argv: list of argument strings; ``sys.argv[1:]`` when omitted

    Returns the argparse namespace with the attributes source, json,
    verbose, to_fb2, to_html, colorize, path, limit and date.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--version", action='version', help="Print version info", version="Version 0.5")
    parser.add_argument("source", type=str, help="RSS URL")
    parser.add_argument("--json", action="store_true", help="Print result as JSON in stdout")
    parser.add_argument("--verbose", action="store_true", help="Outputs verbose status messages")
    parser.add_argument("--to_fb2", action="store_true", help="Save as fb2 file")
    parser.add_argument("--to_html", action="store_true", help="Save as html file")
    parser.add_argument("--colorize", action="store_true", help="Add colors to console output")
    parser.add_argument("--path", type=str, help="Save news to file at entered path.")
    parser.add_argument("--limit", type=int, help="Limit news topics if this parameter provided")
    parser.add_argument("--date", type=int, help="Start work with cached data. Format YYYYMMDD")
    return parser.parse_args(argv)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Every requested output (--json, --to_fb2, --to_html) is produced; when
    none is requested the news is printed as plain text.
    """
    args = parse_arguments(argv)
    # --path is optional, so it is None unless given; only validate a real value.
    if args.path and ' ' in args.path:
        args.path = None
        message = 'Error: path cannot contain spaces.'
        if args.colorize:
            print('\033[31m' + message + '\033[0m')
        else:
            print(message)
        working_dir = input("Input Y to use working directory")
        if 'y' not in working_dir.lower():
            return 0
    rss = RSSReader(args.source, args.limit, args.verbose, args.date, args.path, args.colorize)
    used = False
    if args.json:
        rss.show_json()
        used = True
    if args.to_fb2:
        rss.save_fb2()
        used = True
    if args.to_html:
        rss.save_html()
        used = True
    if not used:
        rss.show_news()
    return 0


if __name__ == "__main__":
    main()
+ # + # For example, the following would provide a command called `sample` which + # executes the function `main` from this package when invoked: + entry_points={ # Optional + 'console_scripts': ['rss_reader=project.rss_reader:main'], + }, +)