From adb2f21897d9aa8233c9851ea89f2aa6c41df8a3 Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Sun, 19 Jan 2014 18:23:30 -0700 Subject: [PATCH 1/3] Don't append newlines inside a span --- html2text.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index 17528901..f45b6b66 100755 --- a/html2text.py +++ b/html2text.py @@ -238,6 +238,7 @@ def __init__(self, out=None, baseurl=''): self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_list = {} # stack of abbreviations to write later self.baseurl = baseurl + self.in_span = False try: del unifiable_n[name2cp('nbsp')] except KeyError: pass @@ -413,7 +414,8 @@ def handle_tag(self, tag, attrs, start): else: self.soft_br() else: - self.p() + if start == 1 or not self.in_span: + self.p() if tag == "br" and start: self.o(" \n") @@ -492,12 +494,14 @@ def handle_tag(self, tag, attrs, start): a['outcount'] = self.outcount self.a.append(a) self.o("][" + str(a['count']) + "]") + self.in_span = False if tag == "img" and start and not self.ignore_images: if has_key(attrs, 'src'): attrs['href'] = attrs['src'] alt = attrs.get('alt', '') self.o("![" + escape_md(alt) + "]") + self.in_span = False if self.inline_links: self.o("(" + escape_md(attrs['href']) + ")") @@ -511,6 +515,7 @@ def handle_tag(self, tag, attrs, start): attrs['outcount'] = self.outcount self.a.append(attrs) self.o("[" + str(attrs['count']) + "]") + self.in_span = False if tag == 'dl' and start: self.p() if tag == 'dt' and not start: self.pbr() @@ -670,6 +675,7 @@ def handle_data(self, data): return else: self.o("[") + self.in_span = True self.maybe_automatic_link = None if not self.code and not self.pre: From ccb13aff1b7a3d3fba7afddc018e8ab028578a31 Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Mon, 20 Jan 2014 08:47:24 -0700 Subject: [PATCH 2/3] Update version number --- html2text-test.py | 10 ++++++++++ html2text.py | 2 +- setup.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 html2text-test.py diff --git a/html2text-test.py b/html2text-test.py new file mode 100644 index 00000000..383b5155 --- /dev/null +++ b/html2text-test.py @@ -0,0 +1,10 @@ +content = """ +
St. Helen's


I climbed St. Helen's today.

It turned out to be as close to perfect as I'd like.

I'm going back to drinking beer and watching TV now. +""" + +import html2text +h = html2text.HTML2Text() +h.body_width = 0 +h.escape_snob = 1 + +print h.handle(content) diff --git a/html2text.py b/html2text.py index f45b6b66..a3c48064 100755 --- a/html2text.py +++ b/html2text.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "3.200.3" +__version__ = "3.200.4" __author__ = "Aaron Swartz (me@aaronsw.com)" __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] diff --git a/setup.py b/setup.py index dd3d9bc2..581a4f04 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = "html2text", - version = "3.200.3", + version = "3.200.4", description = "Turn HTML into equivalent Markdown-structured text.", author = "Aaron Swartz", author_email = "me@aaronsw.com", From 7facb18e7a729c739b2f17221b6070762207d7da Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Mon, 20 Jan 2014 08:58:11 -0700 Subject: [PATCH 3/3] Remove unneeded & clunky test script --- html2text-test.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 html2text-test.py diff --git a/html2text-test.py b/html2text-test.py deleted file mode 100644 index 383b5155..00000000 --- a/html2text-test.py +++ /dev/null @@ -1,10 +0,0 @@ -content = """ -
St. Helen's


I climbed St. Helen's today.

It turned out to be as close to perfect as I'd like.

I'm going back to drinking beer and watching TV now. -""" - -import html2text -h = html2text.HTML2Text() -h.body_width = 0 -h.escape_snob = 1 - -print h.handle(content)