From bd6fd88286320c6db8063d61d838d28d0a43148d Mon Sep 17 00:00:00 2001 From: Barry Brands Date: Wed, 10 Apr 2024 15:04:49 +0200 Subject: [PATCH 1/3] Get text from docx files --- src/Service/FileService.php | 70 +++++++++++++++++++++++++- src/Service/SyncOpenWooService.php | 3 ++ src/Service/SyncXxllncCasesService.php | 3 ++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/Service/FileService.php b/src/Service/FileService.php index 4f63d4d..be7a322 100644 --- a/src/Service/FileService.php +++ b/src/Service/FileService.php @@ -13,6 +13,7 @@ use Psr\Log\LoggerInterface; use App\Entity\Gateway as Source; use Smalot\PdfParser\Parser; +use PhpOffice\PhpWord\IOFactory; /** * Service responsible for woo files. @@ -211,11 +212,13 @@ public function getTextFromDocument(Value $value): ?string return null; } + $base64Decoded = \Safe\base64_decode($file->getBase64()); + switch ($file->getMimeType()) { case 'pdf': case 'application/pdf': try { - $pdf = $this->pdfParser->parseContent(\Safe\base64_decode($file->getBase64())); + $pdf = $this->pdfParser->parseContent($base64Decoded); $text = $pdf->getText(); } catch (\Exception $e) { $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); @@ -224,6 +227,35 @@ public function getTextFromDocument(Value $value): ?string $text = null; } break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + try { + $tempFilePath = tempnam(sys_get_temp_dir(), 'docx'); + if ($tempFilePath === false) { + $this->logger->error('Failed to create a temporary file '.$file->getName()); + $this->style && $this->style->error('Failed to create a temporary file '.$file->getName()); + } + file_put_contents($tempFilePath, $base64Decoded); + + $phpWord = IOFactory::load($tempFilePath); + + $text = ''; + foreach ($phpWord->getSections() as $section) { + $text .= $this->processElements($section->getElements(), $text); + } + + if (empty($text) === true) { + $text = null; + } + + unlink($tempFilePath); + + } catch (\Exception $e) { + $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); + $this->style && $this->style->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); + + $text = null; + } + break; default: $text = null; } @@ -233,6 +265,42 @@ public function getTextFromDocument(Value $value): ?string }//end getTextFromDocument() + + /** + * Loops through docx elements to get the text from. + * + * @param $elements Docx elements. + * @param string $text variable to extend. + * + * @return string $text + */ + private function processElements($elements, string $text): string + { + foreach ($elements as $element) { + switch (get_class($element)) { + case 'PhpOffice\PhpWord\Element\TextRun': + case 'PhpOffice\PhpWord\Element\Cell': + $text .= $this->processElements($element->getElements(), $text); + break; + + case 'PhpOffice\PhpWord\Element\Table': + foreach ($element->getRows() as $row) { + foreach ($row->getCells() as $cell) { + $text .= $this->processElements($cell->getElements(), $text); + } + } + break; + + case 'PhpOffice\PhpWord\Element\Text': + $text .= $element->getText(); + break; + } + } + + return $text; + }//end processElements() + + /** * Returns the data from an document as a response. * diff --git a/src/Service/SyncOpenWooService.php b/src/Service/SyncOpenWooService.php index 68e3a5d..9bbe569 100644 --- a/src/Service/SyncOpenWooService.php +++ b/src/Service/SyncOpenWooService.php @@ -326,6 +326,9 @@ public function syncOpenWooHandler(array $data, array $configuration): array $mappedResult = $this->mappingService->mapping($mapping, $result); // Map categories to prevent multiple variants of the same categorie. $mappedResult = $this->mappingService->mapping($categorieMapping, $mappedResult); + if (isset($mappedResult['samenvatting']) === true) { + $mappedResult['samenvatting'] = html_entity_decode($mappedResult['samenvatting']); + } $validationErrors = $this->validationService->validateData($mappedResult, $schema, 'POST'); if ($validationErrors !== null) { diff --git a/src/Service/SyncXxllncCasesService.php b/src/Service/SyncXxllncCasesService.php index 55094a2..e1c2d49 100644 --- a/src/Service/SyncXxllncCasesService.php +++ b/src/Service/SyncXxllncCasesService.php @@ -391,6 +391,9 @@ public function syncXxllncCasesHandler(array $data, array $configuration): array $mappedResult = $this->mappingService->mapping($mapping, $result); // Map categories to prevent multiple variants of the same categorie. $mappedResult = $this->mappingService->mapping($categorieMapping, $mappedResult); + if (isset($mappedResult['samenvatting']) === true) { + $mappedResult['samenvatting'] = html_entity_decode($mappedResult['samenvatting']); + } $validationErrors = $this->validationService->validateData($mappedResult, $schema, 'POST'); if ($validationErrors !== null) { From b683fd7e0d7b51ff3ddf734f4df857c6c3bc262d Mon Sep 17 00:00:00 2001 From: Barry Brands Date: Wed, 10 Apr 2024 15:32:09 +0200 Subject: [PATCH 2/3] Cleanup --- src/Service/FileService.php | 111 +++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/src/Service/FileService.php b/src/Service/FileService.php index be7a322..48d5ac3 100644 --- a/src/Service/FileService.php +++ b/src/Service/FileService.php @@ -196,6 +196,59 @@ public function generateDownloadEndpoint(string $id, Endpoint $downloadEndpoint) }//end generateDownloadEndpoint() + /** + * Writes a temporary file for short use. + * + * Don't forget to unlink($tempFilePath) after using the file to remove the temporary file. + * + * @param File $file File to write a temporary file from. + * @param string $fileExtension Extension to write the file with. + * @param $base64Decoded File in its decoded form. + * + * @return string|null $tempFilePath The temporary file path. + */ + private function createTemporaryFile(File $file, string $fileExtension, $base64Decoded): ?string + { + $tempFilePath = tempnam(sys_get_temp_dir(), $fileExtension); + if ($tempFilePath === false) { + $this->logger->error('Failed to create a temporary file '.$file->getName()); + $this->style && $this->style->error('Failed to create a temporary file '.$file->getName()); + + return null; + } + file_put_contents($tempFilePath, $base64Decoded); + + return $tempFilePath; + }//end createTemporaryFile() + + /** + * Extracts text from a docx file. + * + * @param File $file to get text from. + * @param $base64Decoded File in its decoded form. + * + * @return string + */ + private function getTextFromDocx(File $file, $base64Decoded): string + { + $tempFilePath = $this->createTemporaryFile($file, 'docx', $base64Decoded); + if ($tempFilePath === null) { + return ''; + } + + $phpWord = IOFactory::load($tempFilePath); + + $text = ''; + foreach ($phpWord->getSections() as $section) { + $text .= $this->processElements($section->getElements(), $text); + } + + // Remove temp file. + unlink($tempFilePath); + + return $text; + }//end getTextFromDocx() + /** * Extracts text from a document (File). @@ -214,49 +267,27 @@ public function getTextFromDocument(Value $value): ?string $base64Decoded = \Safe\base64_decode($file->getBase64()); - switch ($file->getMimeType()) { - case 'pdf': - case 'application/pdf': - try { - $pdf = $this->pdfParser->parseContent($base64Decoded); - $text = $pdf->getText(); - } catch (\Exception $e) { - $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); - $this->style && $this->style->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); - - $text = null; - } - break; - case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - try { - $tempFilePath = tempnam(sys_get_temp_dir(), 'docx'); - if ($tempFilePath === false) { - $this->logger->error('Failed to create a temporary file '.$file->getName()); - $this->style && $this->style->error('Failed to create a temporary file '.$file->getName()); - } - file_put_contents($tempFilePath, $base64Decoded); - - $phpWord = IOFactory::load($tempFilePath); - - $text = ''; - foreach ($phpWord->getSections() as $section) { - $text .= $this->processElements($section->getElements(), $text); - } - - if (empty($text) === true) { + try { + switch ($file->getMimeType()) { + case 'pdf': + case 'application/pdf': + $pdf = $this->pdfParser->parseContent($base64Decoded); + $text = $pdf->getText(); + break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + $text = $this->getTextFromDocx($file, $base64Decoded); + break; + default: $text = null; - } - - unlink($tempFilePath); + } + } catch (\Exception $e) { + $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); + $this->style && $this->style->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); - } catch (\Exception $e) { - $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); - $this->style && $this->style->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); + $text = null; + } - $text = null; - } - break; - default: + if (empty($text) === true) { $text = null; } From f686fcb38593be855733a6fd21e3b5a252f73a1d Mon Sep 17 00:00:00 2001 From: GitHub Actions <> Date: Wed, 10 Apr 2024 13:32:37 +0000 Subject: [PATCH 3/3] Update src from PHP Codesniffer --- src/Service/FileService.php | 61 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/Service/FileService.php b/src/Service/FileService.php index 48d5ac3..32a282d 100644 --- a/src/Service/FileService.php +++ b/src/Service/FileService.php @@ -196,12 +196,13 @@ public function generateDownloadEndpoint(string $id, Endpoint $downloadEndpoint) }//end generateDownloadEndpoint() + /** * Writes a temporary file for short use. * * Don't forget to unlink($tempFilePath) after using the file to remove the temporary file. * - * @param File $file File to write a temporary file from. + * @param File $file File to write a temporary file from. * @param string $fileExtension Extension to write the file with. * @param $base64Decoded File in its decoded form. * @@ -216,15 +217,18 @@ private function createTemporaryFile(File $file, string $fileExtension, $base64D return null; } + file_put_contents($tempFilePath, $base64Decoded); return $tempFilePath; + }//end createTemporaryFile() + /** * Extracts text from a docx file. * - * @param File $file to get text from. + * @param File $file to get text from. * @param $base64Decoded File in its decoded form. * * @return string @@ -247,6 +251,7 @@ private function getTextFromDocx(File $file, $base64Decoded): string unlink($tempFilePath); return $text; + }//end getTextFromDocx() @@ -269,16 +274,16 @@ public function getTextFromDocument(Value $value): ?string try { switch ($file->getMimeType()) { - case 'pdf': - case 'application/pdf': - $pdf = $this->pdfParser->parseContent($base64Decoded); - $text = $pdf->getText(); - break; - case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - $text = $this->getTextFromDocx($file, $base64Decoded); - break; - default: - $text = null; + case 'pdf': + case 'application/pdf': + $pdf = $this->pdfParser->parseContent($base64Decoded); + $text = $pdf->getText(); + break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + $text = $this->getTextFromDocx($file, $base64Decoded); + break; + default: + $text = null; } } catch (\Exception $e) { $this->logger->error('Something went wrong extracting text from '.$file->getName().' '.$e->getMessage()); @@ -296,12 +301,11 @@ public function getTextFromDocument(Value $value): ?string }//end getTextFromDocument() - /** * Loops through docx elements to get the text from. * * @param $elements Docx elements. - * @param string $text variable to extend. + * @param string $text variable to extend. * * @return string $text */ @@ -309,26 +313,27 @@ private function processElements($elements, string $text): string { foreach ($elements as $element) { switch (get_class($element)) { - case 'PhpOffice\PhpWord\Element\TextRun': - case 'PhpOffice\PhpWord\Element\Cell': - $text .= $this->processElements($element->getElements(), $text); - break; - - case 'PhpOffice\PhpWord\Element\Table': - foreach ($element->getRows() as $row) { - foreach ($row->getCells() as $cell) { - $text .= $this->processElements($cell->getElements(), $text); - } + case 'PhpOffice\PhpWord\Element\TextRun': + case 'PhpOffice\PhpWord\Element\Cell': + $text .= $this->processElements($element->getElements(), $text); + break; + + case 'PhpOffice\PhpWord\Element\Table': + foreach ($element->getRows() as $row) { + foreach ($row->getCells() as $cell) { + $text .= $this->processElements($cell->getElements(), $text); } - break; + } + break; - case 'PhpOffice\PhpWord\Element\Text': - $text .= $element->getText(); - break; + case 'PhpOffice\PhpWord\Element\Text': + $text .= $element->getText(); + break; } } return $text; + }//end processElements()