diff --git a/CHANGELOG.md b/CHANGELOG.md index 62919d7..61bee28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 0.3.0 + +* Switch to `cfl3` CFL patched fork instead of patching as part of this build. + * This improves support for certain font CMaps + * Remove `--tounicode` in favour of `--ignore-tounicode`, as `force` is no longer required. + ## 0.2.2 * Patch memory corruption bug due to PNG background images being the incorrect size. diff --git a/README.md b/README.md index 8206a7b..49b159f 100644 --- a/README.md +++ b/README.md @@ -28,4 +28,4 @@ docker run corefiling/pdf2html pdf2htmlEX:$version --help Since pdf2htmlex is licensed under the GPL, this project is too (see the LICENSE.TXT file). -As you can see from the build process, pdf2htmlEX itself is patched by the patches within this project (see [src/Pdf2Html/patches](tree/src/Pdf2Html/patches)), based on a clone of the upstream project tag we are targeting. As such we have not repeated pdf2htmlEX's source code here; you can find it via the link above. +As you can see from the build process, pdf2htmlEX itself is acquired from our fork of pdf2htmlEX/pdf2htmlEX found here: https://github.com/CoreFiling/pdf2htmlEX/tree/feature/cfl-patches diff --git a/src/Pdf2Html/Dockerfile b/src/Pdf2Html/Dockerfile index 99bcfdb..5b794ba 100644 --- a/src/Pdf2Html/Dockerfile +++ b/src/Pdf2Html/Dockerfile @@ -3,40 +3,12 @@ FROM ubuntu:noble AS build-pdf2htmlex # Produces a patched pdf2htmlEX using libopenjp 2.7 instead of libjpeg to get JPEG2000 support. 
-ENV PDF2HTMLEX_BRANCH= -ENV UNATTENDED="--assume-yes" -ENV MAKE_PARALLEL="-j 4" -ENV PDF2HTMLEX_PREFIX=/usr/local -ENV DEBIAN_FRONTEND=noninteractive - WORKDIR /source RUN apt update && apt install -y git patch sudo -RUN git clone --depth=1 --branch v0.18.8.rc1 https://github.com/pdf2htmlEX/pdf2htmlEX +RUN git clone --depth=1 --branch 0.18.8.rc1-cfl3 https://github.com/CoreFiling/pdf2htmlEX WORKDIR /source/pdf2htmlEX -COPY ./pdf2htmlEX/patches ./patches -RUN patch ./buildScripts/versionEnvs ./patches/versionEnvs.patch -RUN patch ./buildScripts/buildPoppler ./patches/buildPoppler.patch -RUN patch ./buildScripts/getBuildToolsApt ./patches/getBuildToolsApt.patch -RUN patch ./buildScripts/getDevLibrariesApt ./patches/getDevLibrariesApt.patch -RUN patch ./pdf2htmlEX/src/BackgroundRenderer/SplashBackgroundRenderer.cc ./patches/SplashBackgroundRenderer.cc.patch -RUN patch ./pdf2htmlEX/src/util/unicode.cc ./patches/unicode.cc.patch -RUN patch ./pdf2htmlEX/src/util/unicode.h ./patches/unicode.h.patch -RUN patch ./pdf2htmlEX/CMakeLists.txt ./patches/CMakeLists.patch - -RUN ./buildScripts/versionEnvs -RUN ./buildScripts/reportEnvs -RUN ./buildScripts/getBuildToolsApt -RUN ./buildScripts/getDevLibrariesApt -RUN ./buildScripts/getPoppler -RUN patch ./poppler/glib/poppler-enums.c.template ./patches/poppler-enums.c.template.patch -RUN patch ./poppler/glib/poppler-private.h ./patches/poppler-private.h.patch -RUN ./buildScripts/buildPoppler -RUN ./buildScripts/getFontforge -RUN patch ./fontforge/fontforge/tottfgpos.c ./patches/fontforge-tottfgpos.c.patch -RUN ./buildScripts/buildFontforge -RUN ./buildScripts/buildPdf2htmlEX -RUN ./buildScripts/installPdf2htmlEX +RUN ./buildScripts/buildInstallLocallyApt RUN git config user.name "CoreFiling" RUN git config user.email "opensource@corefiling.com" RUN ./buildScripts/createDebianPackage @@ -51,7 +23,7 @@ RUN apt update && apt install -y wget RUN wget 
http://archive.ubuntu.com/ubuntu/pool/main/libj/libjpeg-turbo/libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb RUN apt install -y ./libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb COPY --from=build-pdf2htmlex /source/pdf2htmlEX/imageBuild/*.deb /pdf2htmlEX/ -RUN apt install -y libjpeg62 libopenjp2-7 /pdf2htmlEX/pdf2htmlEX-0.18.8.rc1-cfl2-*-x86_64.deb +RUN apt install -y libjpeg62 libopenjp2-7 /pdf2htmlEX/pdf2htmlEX-0.18.8.rc1-cfl3-*-x86_64.deb WORKDIR /app COPY --from=build /app ./ diff --git a/src/Pdf2Html/Pdf2Html.csproj b/src/Pdf2Html/Pdf2Html.csproj index f3255d8..70d4cc9 100644 --- a/src/Pdf2Html/Pdf2Html.csproj +++ b/src/Pdf2Html/Pdf2Html.csproj @@ -4,7 +4,7 @@ net8.0 enable enable - 0.2.2 + 0.3.0 Pdf2Html Pdf2Html diff --git a/src/Pdf2Html/appsettings.json b/src/Pdf2Html/appsettings.json index 9f04ae9..12042e4 100644 --- a/src/Pdf2Html/appsettings.json +++ b/src/Pdf2Html/appsettings.json @@ -6,8 +6,7 @@ "Printing": false, "BgFormat": "svg", "SvgNodeCountLimit": 100, - "DecomposeLigature": true, - "Tounicode": true + "DecomposeLigature": true }, "Logging": { "LogLevel": { diff --git a/src/Pdf2Html/pdf2htmlEX/patches/CMakeLists.patch b/src/Pdf2Html/pdf2htmlEX/patches/CMakeLists.patch deleted file mode 100644 index 2068ac0..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/CMakeLists.patch +++ /dev/null @@ -1,19 +0,0 @@ -@@ -23,6 +23,10 @@ add_custom_target(dist - - find_package(PkgConfig) - -+include_directories( -+ /usr/include/glib-2.0 -+ /usr/lib/x86_64-linux-gnu/glib-2.0/include -+) - - # SINCE we have a very intimate relationship with a particular version of - # poppler... 
we explicitly describe the poppler include and library -@@ -97,6 +101,7 @@ set(PDF2HTMLEX_LIBS ${PDF2HTMLEX_LIBS} - ${LIB_INTL_LIBRARIES} - ${CAIRO_LIBRARIES} - -ljpeg -+ -lopenjp2 - -lpng - -lfontconfig - -lfreetype diff --git a/src/Pdf2Html/pdf2htmlEX/patches/SplashBackgroundRenderer.cc.patch b/src/Pdf2Html/pdf2htmlEX/patches/SplashBackgroundRenderer.cc.patch deleted file mode 100644 index 3eb1306..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/SplashBackgroundRenderer.cc.patch +++ /dev/null @@ -1,20 +0,0 @@ -@@ -137,7 +137,7 @@ void SplashBackgroundRenderer::embed_image(int pageno) - // end of hack - - // dump the background image only when it is not empty -- if((xmin <= xmax) && (ymin <= ymax)) -+ if((xmin < xmax) && (ymin < ymax)) - { - { - auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str()); -@@ -185,8 +185,8 @@ void SplashBackgroundRenderer::embed_image(int pageno) - // There might be mem leak when exception is thrown ! 
- void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1, int x2, int y2) - { -- int width = x2 - x1 + 1; -- int height = y2 - y1 + 1; -+ int width = x2 - x1; -+ int height = y2 - y1; - if((width <= 0) || (height <= 0)) - throw "Bad metric for background image"; - diff --git a/src/Pdf2Html/pdf2htmlEX/patches/buildPoppler.patch b/src/Pdf2Html/pdf2htmlEX/patches/buildPoppler.patch deleted file mode 100644 index e9c212d..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/buildPoppler.patch +++ /dev/null @@ -1,9 +0,0 @@ -@@ -27,7 +27,7 @@ - -DENABLE_GOBJECT_INTROSPECTION=OFF \ - -DENABLE_GTK_DOC=OFF \ - -DENABLE_QT5=OFF \ -- -DENABLE_LIBOPENJPEG="none" \ -+ -DENABLE_LIBOPENJPEG="openjpeg2" \ - -DENABLE_CMS="none" \ - -DENABLE_DCTDECODER="libjpeg" \ - -DENABLE_LIBCURL=OFF \ diff --git a/src/Pdf2Html/pdf2htmlEX/patches/fontforge-tottfgpos.c.patch b/src/Pdf2Html/pdf2htmlEX/patches/fontforge-tottfgpos.c.patch deleted file mode 100644 index 44be157..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/fontforge-tottfgpos.c.patch +++ /dev/null @@ -1,13 +0,0 @@ -@@ -2091,10 +2091,10 @@ - } - - static uint16 *FigureInitialClasses(FPST *fpst) { -- uint16 *initial = malloc((fpst->nccnt+1)*sizeof(uint16)); -+ uint16 *initial = malloc((fpst->rule_cnt+1)*sizeof(uint16)); - int i, cnt, j; - -- initial[fpst->nccnt] = 0xffff; -+ initial[fpst->rule_cnt] = 0xffff; - for ( i=cnt=0; irule_cnt; ++i ) { - for ( j=0; jrules[i].u.class.nclasses[0] ) diff --git a/src/Pdf2Html/pdf2htmlEX/patches/getBuildToolsApt.patch b/src/Pdf2Html/pdf2htmlEX/patches/getBuildToolsApt.patch deleted file mode 100644 index 68ecaf8..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/getBuildToolsApt.patch +++ /dev/null @@ -1,7 +0,0 @@ -@@ -30,6 +30,6 @@ sudo apt-get $UNATTENDED install \ - dpkg \ - dpkg-dev \ - gettext \ -- openjdk-8-jre-headless \ -+ openjdk-11-jre-headless \ - jq diff --git a/src/Pdf2Html/pdf2htmlEX/patches/getDevLibrariesApt.patch 
b/src/Pdf2Html/pdf2htmlEX/patches/getDevLibrariesApt.patch deleted file mode 100644 index 387af0d..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/getDevLibrariesApt.patch +++ /dev/null @@ -1,5 +0,0 @@ -@@ -21,3 +21,4 @@ - libpng-dev \ - libjpeg-dev \ - libxml2-dev \ -+ libopenjp2-7-dev \ diff --git a/src/Pdf2Html/pdf2htmlEX/patches/poppler-enums.c.template.patch b/src/Pdf2Html/pdf2htmlEX/patches/poppler-enums.c.template.patch deleted file mode 100644 index 42bd4cd..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/poppler-enums.c.template.patch +++ /dev/null @@ -1,18 +0,0 @@ -@@ -17,7 +17,7 @@ GType - { - static volatile gsize g_define_type_id__volatile = 0; - -- if (g_once_init_enter (&g_define_type_id__volatile)) { -+ if (g_once_init_enter ((gsize*) &g_define_type_id__volatile)) { - static const G@Type@Value values[] = { - /*** END value-header ***/ - -@@ -31,7 +31,7 @@ GType - GType g_define_type_id = - g_@type@_register_static (g_intern_static_string ("@EnumName@"), values); - -- g_once_init_leave (&g_define_type_id__volatile, g_define_type_id); -+ g_once_init_leave ((gsize*) &g_define_type_id__volatile, g_define_type_id); - } - - return g_define_type_id__volatile; diff --git a/src/Pdf2Html/pdf2htmlEX/patches/poppler-private.h.patch b/src/Pdf2Html/pdf2htmlEX/patches/poppler-private.h.patch deleted file mode 100644 index c71a271..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/poppler-private.h.patch +++ /dev/null @@ -1,15 +0,0 @@ -@@ -155,12 +155,12 @@ GType - type_name##_get_type (void) \ - { \ - static volatile gsize g_define_type_id__volatile = 0; \ -- if (g_once_init_enter (&g_define_type_id__volatile)) { \ -+ if (g_once_init_enter ((gsize*) &g_define_type_id__volatile)) { \ - GType g_define_type_id = \ - g_boxed_type_register_static (g_intern_static_string (#TypeName), \ - (GBoxedCopyFunc) copy_func, \ - (GBoxedFreeFunc) free_func); \ -- g_once_init_leave (&g_define_type_id__volatile, g_define_type_id); \ -+ g_once_init_leave ((gsize*) 
&g_define_type_id__volatile, g_define_type_id); \ - } \ - return g_define_type_id__volatile; \ - } diff --git a/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch b/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch deleted file mode 100644 index d0b9b46..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/unicode.cc.patch +++ /dev/null @@ -1,18 +0,0 @@ -@@ -47,6 +47,8 @@ Unicode unicode_from_font (CharCode code, GfxFont * font) - if(cname) - { - Unicode ou = globalParams->mapNameToUnicodeText(cname); -+ if(ou == '\t') -+ return ' '; - if(!is_illegal_unicode(ou)) - return ou; - } -@@ -62,6 +64,8 @@ Unicode check_unicode(Unicode const * u, int len, CharCode code, GfxFont * font) - - if(len == 1) - { -+ if(*u == '\t') -+ return ' '; - if(!is_illegal_unicode(*u)) - return *u; - } diff --git a/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch b/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch deleted file mode 100644 index 11a3310..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/unicode.h.patch +++ /dev/null @@ -1,28 +0,0 @@ -@@ -27,7 +27,7 @@ namespace pdf2htmlEX { - * 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space) - * webkit: [--------------------------------) [------------------) [-] - * moz: [--------------------------------) [---------] [-] -- * p2h: [--------------------------------) [------------------] [-] [-] [-] -+ * p2h: [--------------------------------) [------------------) [-] [-] [-] - * - * 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RL0)--2066(LRI)--2069(PDI) - * webkit: [-----------------------------------------------] [----------] -@@ -39,9 +39,6 @@ namespace pdf2htmlEX { - * moz: - * p2h: [------------------] [-] [-] [-----------------] - * -- * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified, -- * \n and \r can break line, \t can shift text, so they are considered illegal. 
-- * - * Resources (retrieved at 2015-03-16) - * * webkit - * * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 ) -@@ -58,7 +55,7 @@ namespace pdf2htmlEX { - */ - inline bool is_illegal_unicode(Unicode c) - { -- return (c < 0x20) || (c >= 0x7F && c <= 0xA0) || (c == 0xAD) -+ return (c < 0x20) || (c >= 0x7F && c < 0xA0) || (c == 0xAD) - || (c >= 0x300 && c <= 0x36f) // DCRH Combining diacriticals - || (c >= 0x1ab0 && c <= 0x1aff) // DCRH Combining diacriticals - || (c >= 0x1dc0 && c <= 0x1dff) // DCRH Combining diacriticals diff --git a/src/Pdf2Html/pdf2htmlEX/patches/versionEnvs.patch b/src/Pdf2Html/pdf2htmlEX/patches/versionEnvs.patch deleted file mode 100644 index 50d4d70..0000000 --- a/src/Pdf2Html/pdf2htmlEX/patches/versionEnvs.patch +++ /dev/null @@ -1,9 +0,0 @@ -@@ -6,7 +6,7 @@ - # see: https://poppler.freedesktop.org/releases.html - # current working: 0.89.0 - --export PDF2HTMLEX_VERSION=0.18.8.rc2 -+export PDF2HTMLEX_VERSION=0.18.8.rc1-cfl2 - - export POPPLER_VERSION=poppler-0.89.0 - #export POPPLER_VERSION=poppler-0.88.0