diff --git a/CHANGELOG.md b/CHANGELOG.md index 35314d0..0faac0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ * Add optional overrides for command-line arguments passed to `pdf2htmlEX`. * Patch and build `pdf2htmlEX` as part of this build process to use `libopenjp` instead of `libjpeg` for JPEG-2000 support. * All patches are in this source tree, and are applied to directly to the source of the upstream tag during build. -* Patch issue with non-breaking spaces in `pdf2HTMLEX`. +* Patch issue with non-breaking spaces and tab characters in `pdf2htmlEX`. * Convert complex SVGs images to bitmaps. ## 0.1.0 diff --git a/src/Pdf2Html/Dockerfile b/src/Pdf2Html/Dockerfile index 496c5af..dee254d 100644 --- a/src/Pdf2Html/Dockerfile +++ b/src/Pdf2Html/Dockerfile @@ -19,6 +19,7 @@ RUN patch ./buildScripts/versionEnvs ./patches/versionEnvs.patch RUN patch ./buildScripts/buildPoppler ./patches/buildPoppler.patch RUN patch ./buildScripts/getBuildToolsApt ./patches/getBuildToolsApt.patch RUN patch ./buildScripts/getDevLibrariesApt ./patches/getDevLibrariesApt.patch +RUN patch ./pdf2htmlEX/src/util/unicode.cc ./patches/unicode.cc.patch RUN patch ./pdf2htmlEX/src/util/unicode.h ./patches/unicode.h.patch RUN patch ./pdf2htmlEX/CMakeLists.txt ./patches/CMakeLists.patch diff --git a/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch b/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch new file mode 100644 index 0000000..d0b9b46 --- /dev/null +++ b/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch @@ -0,0 +1,18 @@ +@@ -47,6 +47,8 @@ Unicode unicode_from_font (CharCode code, GfxFont * font) + if(cname) + { + Unicode ou = globalParams->mapNameToUnicodeText(cname); ++ if(ou == '\t') ++ return ' '; + if(!is_illegal_unicode(ou)) + return ou; + } +@@ -62,6 +64,8 @@ Unicode check_unicode(Unicode const * u, int len, CharCode code, GfxFont * font) + + if(len == 1) + { ++ if(*u == '\t') ++ return ' '; + if(!is_illegal_unicode(*u)) + return *u; + } diff --git 
a/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch b/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch index 4601815..11a3310 100644 --- a/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch +++ b/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch @@ -1,3 +1,12 @@ +@@ -27,7 +27,7 @@ namespace pdf2htmlEX { + * 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space) + * webkit: [--------------------------------) [------------------) [-] + * moz: [--------------------------------) [---------] [-] +- * p2h: [--------------------------------) [------------------] [-] [-] [-] ++ * p2h: [--------------------------------) [------------------) [-] [-] [-] + * + * 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RL0)--2066(LRI)--2069(PDI) + * webkit: [-----------------------------------------------] [----------] @@ -39,9 +39,6 @@ namespace pdf2htmlEX { * moz: * p2h: [------------------] [-] [-] [-----------------] diff --git a/tests/E2E.Tests/Resources/CS_cheat_sheet.html b/tests/E2E.Tests/Resources/CS_cheat_sheet.html index b35cd44..7af0cb2 100644 --- a/tests/E2E.Tests/Resources/CS_cheat_sheet.html +++ b/tests/E2E.Tests/Resources/CS_cheat_sheet.html @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff65d9e1cc4864dc0db647594c33c01333faa20e0e104379b42ae2b8e9694c0a -size 1086803 +oid sha256:e020014ff0cab94ab78700278ed7b54852b944ccb366015b1a60ae944e0780d7 +size 1086801