Orzu Ionut
3 years ago
10 changed files with 516 additions and 547 deletions
-
3README.md
-
24app/Ingest/AbstractConvertor.php
-
434app/Ingest/Convertor.php
-
6app/Ingest/DocumentHandler.php
-
46app/Ingest/DocxConvertor.php
-
50app/Ingest/OtherConvertor.php
-
271app/Ingest/PDFConvertor.php
-
52app/Ingest/TextConvertor.php
-
103app/Jobs/IngestDocuments.php
-
74app/Jobs/SendToCore.php
@ -0,0 +1,24 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
/**
 * Common base for the ingest convertors.
 *
 * Keeps the storage handle, the path of the document being converted and the
 * directory containing that document, and offers subclasses a helper that
 * removes the source document from storage.
 */
abstract class AbstractConvertor
{
    /**
     * Storage handle. The code in this class only calls delete() on it.
     *
     * @var mixed
     */
    protected $storage;

    /**
     * Path of the document to convert, exactly as given to the constructor.
     *
     * @var string
     */
    protected $path;

    /**
     * Directory component of $path.
     *
     * @var string
     */
    protected $directoryPath;

    /**
     * @param mixed  $storage Storage handle used to reach the document.
     * @param string $path    Path of the document to convert.
     */
    public function __construct($storage, $path)
    {
        $this->path = $path;
        $this->directoryPath = dirname($path);
        $this->storage = $storage;
    }

    /**
     * Run the conversion implemented by the concrete convertor.
     */
    abstract public function execute();

    /**
     * Remove the source document from storage.
     */
    protected function deleteOriginalDocument()
    {
        $this->storage->delete($this->path);
    }
}
@ -0,0 +1,46 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
use Symfony\Component\Process\Exception\ProcessFailedException; |
||||
|
use Symfony\Component\Process\Process; |
||||
|
|
||||
|
/**
 * Converts a docx document to plain text with LibreOffice (soffice) and then
 * hands the resulting text file to TextConvertor.
 */
class DocxConvertor extends AbstractConvertor
{
    /**
     * Convert the document to text and run the text pipeline on the result.
     *
     * @return void
     *
     * @throws ProcessFailedException When soffice exits unsuccessfully.
     */
    public function execute()
    {
        $this->convertToText();

        // soffice names the converted file after the input file, so derive the
        // name from the source instead of assuming it is called "document".
        $name = pathinfo($this->path, PATHINFO_FILENAME);

        $convertor = new TextConvertor($this->storage, "{$this->directoryPath}/{$name}.txt");

        $convertor->execute();
    }

    /**
     * Convert docx file to text
     *
     * The text file is written into the directory of the source document and
     * the source document is deleted once the conversion succeeded.
     *
     * @return void
     *
     * @throws ProcessFailedException When soffice exits unsuccessfully.
     */
    protected function convertToText()
    {
        // HOME must be part of the environment of the soffice process itself.
        // Running "export HOME=..." as a separate process cannot work: it is a
        // shell builtin, and a child process could not change the environment
        // of a later sibling process anyway.
        $home = env('USER_HOME_PATH');

        $process = new Process(
            [
                'soffice',
                '--headless',
                '--convert-to',
                'txt',
                $this->storage->path($this->path),
                '--outdir',
                $this->storage->path($this->directoryPath),
            ],
            null,
            // Passing null keeps the inherited environment untouched when no
            // home path is configured.
            $home ? ['HOME' => $home] : null
        );

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        $this->deleteOriginalDocument();
    }
}
@ -0,0 +1,50 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
use Symfony\Component\Process\Exception\ProcessFailedException; |
||||
|
use Symfony\Component\Process\Process; |
||||
|
|
||||
|
/**
 * Converts office-style documents to docx with LibreOffice (soffice) and then
 * hands the resulting file to DocxConvertor.
 */
class OtherConvertor extends AbstractConvertor
{
    /**
     * Convert the document to docx and run the docx pipeline on the result.
     *
     * @return void
     *
     * @throws ProcessFailedException When soffice exits unsuccessfully.
     */
    public function execute()
    {
        $this->convertToDocx();

        // soffice names the converted file after the input file, so derive the
        // name from the source instead of assuming it is called "document".
        $name = pathinfo($this->path, PATHINFO_FILENAME);

        $convertor = new DocxConvertor($this->storage, "{$this->directoryPath}/{$name}.docx");

        $convertor->execute();
    }

    /**
     * Convert doc,dot,rtf,odt,pdf,docx to docx
     *
     * The docx file is written into the directory of the source document and
     * the source document is deleted once the conversion succeeded.
     *
     * @return void
     *
     * @throws ProcessFailedException When soffice exits unsuccessfully.
     */
    private function convertToDocx()
    {
        // HOME must be part of the environment of the soffice process itself.
        // Running "export HOME=..." as a separate process cannot work: it is a
        // shell builtin, and a child process could not change the environment
        // of a later sibling process anyway.
        $home = env('USER_HOME_PATH');

        $process = new Process(
            [
                'soffice',
                '--headless',
                '--convert-to',
                'docx',
                $this->storage->path($this->path),
                '--outdir',
                $this->storage->path($this->directoryPath),
            ],
            null,
            // Passing null keeps the inherited environment untouched when no
            // home path is configured.
            $home ? ['HOME' => $home] : null
        );

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        $this->deleteOriginalDocument();
    }
}
@ -0,0 +1,271 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
use League\HTMLToMarkdown\HtmlConverter; |
||||
|
use Symfony\Component\Process\Exception\ProcessFailedException; |
||||
|
use Symfony\Component\Process\Process; |
||||
|
|
||||
|
class PDFConvertor extends AbstractConvertor |
||||
|
{ |
||||
|
/**
 * Convert the PDF into "<document directory>/document.md".
 *
 * When the PDF contains text, every page's HTML (built by getFileContents())
 * is converted to Markdown and the pages are concatenated. When it contains
 * only images, each jpg/png found in the document directory is run through
 * OCR, deleted, and the recognised text is stored instead.
 *
 * @return void
 *
 * @throws \Exception When neither text nor images could be extracted.
 */
public function execute()
{
    $result = $this->getFileContents();

    if (! $result['has_images'] && ! $result['has_text']) {
        throw new \Exception('Cannot get pdf file contents.');
    }

    $markdownPath = "$this->directoryPath/document.md";

    if ($result['has_text']) {
        // The converter configuration does not depend on the page, so build
        // it once and reuse it for every page.
        $converter = new HtmlConverter();
        $converter->getConfig()->setOption('strip_tags', true);

        $mdContents = '';

        foreach ($result['htmls'] as $html) {
            $mdContents .= "\n\n" . $converter->convert($html);
        }

        $this->storage->put($markdownPath, $mdContents);

        return;
    }

    // Only contains images.
    $imagesContent = '';

    // List the document directory: $this->path is the PDF file itself, which
    // getFileContents() has already deleted, so listing it never yields the
    // extracted images.
    $files = $this->storage->allFiles($this->directoryPath);

    foreach ($files as $file) {
        // Only get the image files from the directory, it may contain some empty html files too.
        // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
        if (! in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'], true)) {
            continue;
        }

        $ocr = new OCR($this->storage->path($file));

        $imagesContent .= $ocr->execute();

        $this->storage->delete($file);
    }

    $this->storage->put($markdownPath, $imagesContent);
}
||||
|
|
||||
|
/**
 * Run pdftohtml in XML mode on the source PDF, delete the PDF and return the
 * data extracted from the generated XML.
 *
 * @return array Result of getDataFromXML().
 *
 * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
 */
protected function getFileContents()
{
    // Output prefix handed to pdftohtml; getDataFromXML() reads "html.xml"
    // from the same directory afterwards.
    $target = $this->storage->path("$this->directoryPath/html");
    $source = $this->storage->path($this->path);

    // mustRun() throws ProcessFailedException on a non-zero exit code.
    (new Process(['pdftohtml', '-xml', $source, $target]))->mustRun();

    // Remove original document.
    $this->storage->delete($this->path);

    return $this->getDataFromXML();
}
||||
|
|
||||
|
/**
 * Read "<document directory>/html.xml" and build one HTML document per page.
 *
 * Every child of a <page> that carries a "top" attribute is grouped by that
 * value and the groups are sorted ascending, so the page is rendered from top
 * to bottom. All text items sharing the same "top" end up in one <p>. Images
 * are replaced inline by their caption ("Fig. N") and appended after the page
 * text. <fontspec> elements are collected and passed to handleText().
 *
 * When neither text nor images are found the whole document directory is
 * deleted; otherwise only the XML file is removed.
 *
 * @return array{has_images: bool, has_text: bool, htmls: string[]}
 *
 * @throws \Exception When building the HTML fails; the document directory is
 *                    deleted first and the cause is attached as "previous".
 */
protected function getDataFromXML()
{
    $xmlFilePath = "$this->directoryPath/html.xml";

    // Collect libxml errors internally so malformed XML makes
    // simplexml_load_string() return false instead of raising PHP warnings.
    $previousSetting = libxml_use_internal_errors(true);
    $xml = simplexml_load_string((string) $this->storage->get($xmlFilePath));
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // An unparsable file is treated as a document without pages: it then
    // falls through to the "no text and no images" cleanup below.
    $pages = $xml === false ? [] : $xml->page;

    $orderedList = [];
    $fonts = [];

    foreach ($pages as $page) {
        $pageNumber = (int) $page['number'];

        $orderedList[$pageNumber] = [];

        foreach ($page as $p) {
            if ($p->getName() === 'fontspec') {
                $fonts[(int) $p['id']] = [
                    'family' => (string) $p['family'],
                    'size' => (string) $p['size'],
                    'color' => (string) $p['color'],
                ];
            }

            if (isset($p['top'])) {
                $top = (int) $p['top'];

                if (! array_key_exists($top, $orderedList[$pageNumber])) {
                    $orderedList[$pageNumber][$top] = [];
                }

                $orderedList[$pageNumber][$top][] = $p;
            }
        }

        // Order the groups by vertical position.
        ksort($orderedList[$pageNumber]);
    }

    $htmls = [];
    $hasImages = false;
    $hasText = false;

    $imagesCount = 0;
    // When true, images are emitted after the page text and only their
    // caption is placed inline.
    $imagesInFooter = true;

    try {
        foreach ($orderedList as $page) {
            $html = '';
            $footerImages = [];

            foreach ($page as $items) {
                $continuousP = '';

                foreach ($items as $p) {
                    if ($p->getName() == 'image') {
                        $hasImages = true;

                        $imagesCount += 1;
                        $caption = "Fig. $imagesCount";

                        $imageHTML = $this->handleImage($p, $caption);

                        if (! $imagesInFooter) {
                            $html = $html . $imageHTML;
                        } else {
                            $html = $html . "<p> $caption </p>";

                            $footerImages[] = $imageHTML;
                        }
                    }

                    if ($p->getName() == 'text') {
                        $continuousP = $continuousP . $this->handleText($p, $fonts);

                        $hasText = true;
                    }
                }

                $html = $html . '<p>' . $continuousP . '</p>';
            }

            if ($imagesInFooter) {
                foreach ($footerImages as $footerImage) {
                    $html = $html . '<p>' . $footerImage . '</p>';
                }
            }

            $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
        }
    } catch (\Exception $exception) {
        $this->storage->deleteDirectory($this->directoryPath);

        // Log the message as well as the trace; the trace alone does not say
        // what actually failed.
        \Illuminate\Support\Facades\Log::error(
            $exception->getMessage() . "\n" . $exception->getTraceAsString()
        );

        // Keep the generic message but chain the cause for callers.
        throw new \Exception('Something went wrong.', 0, $exception);
    }

    if (! $hasText && ! $hasImages) {
        // Remove directory because we do not have any use for it anymore.
        $this->storage->deleteDirectory($this->directoryPath);
    } else {
        // Remove the unnecessary 'xml' file.
        $this->storage->delete($xmlFilePath);
    }

    return [
        'has_images' => $hasImages,
        'has_text' => $hasText,
        'htmls' => $htmls,
    ];
}
||||
|
|
||||
|
/**
 * Build the HTML snippet for one image item: an absolutely positioned <img>
 * wrapped in line breaks.
 *
 * @param mixed  $p       Image item; read through the keys "src", "top",
 *                        "left", "width" and "height".
 * @param string $caption Text used for both the alt and title attributes.
 *
 * @return string
 */
protected function handleImage($p, $caption)
{
    // The values come from the converted document, so escape them before
    // they are placed inside HTML attributes.
    $escape = function ($value) {
        return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
    };

    // Only the file name is kept; the image is referenced relative to the
    // generated document.
    $src = $escape('./' . pathinfo((string) $p['src'], PATHINFO_BASENAME));
    $caption = $escape($caption);

    $style = 'position: absolute; top: ' . $escape($p['top']) . 'px; left: ' . $escape($p['left']) . 'px;';

    return '<br>'
        . '<img style="' . $style . '"'
        . ' width="' . $escape($p['width']) . '"'
        . ' height="' . $escape($p['height']) . '"'
        . ' src="' . $src . '"'
        . ' alt="' . $caption . '"'
        . ' title="' . $caption . '">'
        . '<br>'
        . '<br>';
}
||||
|
|
||||
|
/**
 * Build the HTML for one <text> item of the XML.
 *
 * The element is rendered as h1/h2/h3/span (chosen by getTag() from the font
 * size) with an inline style carrying its position, size and font.
 *
 * @param \SimpleXMLElement $p     The <text> element.
 * @param array             $fonts Font specs keyed by font id, each with the
 *                                 keys "family", "size" and "color".
 *
 * @return string
 */
protected function handleText($p, $fonts)
{
    $fontId = (int) $p['font'];
    $fontSize = $fonts[$fontId]['size'];
    $fontColor = $fonts[$fontId]['color'];
    $fontFamily = $fonts[$fontId]['family'];

    $style = 'position: absolute;'
        . "color: $fontColor;"
        . "font-family: $fontFamily;"
        . 'font-weight: 900;'
        . 'width: ' . $p['width'] . 'px;'
        . 'height: ' . $p['height'] . 'px;'
        . 'top: ' . $p['top'] . 'px;'
        . 'left: ' . $p['left'] . 'px;'
        . "font-size: $fontSize" . 'px;';

    // Serialise every child node of the element instead of picking only the
    // first <i>/<b>: this keeps text surrounding inline markup
    // ("foo <i>bar</i> baz") and re-escapes characters such as "<" and "&"
    // that the parser decoded.
    $node = dom_import_simplexml($p);
    $content = '';

    foreach ($node->childNodes as $child) {
        // LIBXML_NOEMPTYTAG avoids self-closing tags, which HTML parsers
        // would read as unclosed opening tags.
        $content .= $node->ownerDocument->saveXML($child, LIBXML_NOEMPTYTAG);
    }

    $tag = $this->getTag($fontSize);

    return '<' . $tag . ' style="' . htmlspecialchars($style, ENT_QUOTES, 'UTF-8') . '">'
        . $content
        . '</' . $tag . '>';
}
||||
|
|
||||
|
/**
 * Pick the HTML tag used for a piece of text from its font size.
 *
 * Sizes above 24 map to h1, above 18 to h2, above 16 to h3; everything else
 * is rendered as a span.
 *
 * @param int|string $size Font size.
 *
 * @return string
 */
protected function getTag($size)
{
    // Checked in order, largest threshold first.
    $headings = [
        24 => 'h1',
        18 => 'h2',
        16 => 'h3',
    ];

    foreach ($headings as $threshold => $tag) {
        if ($size > $threshold) {
            return $tag;
        }
    }

    return 'span';
}
||||
|
|
||||
|
/**
 * Install the "pdftotext" Python package with pip3.
 *
 * NOTE(review): nothing in this class calls this method any more, and
 * installing packages at request/job time is fragile; consider moving this
 * to the deployment or provisioning step.
 *
 * @return void
 *
 * @throws ProcessFailedException When pip3 exits unsuccessfully.
 */
protected function prepareForConvertPDF()
{
    // HOME must be part of the environment of the pip3 process itself.
    // Running "export HOME=..." as a separate process cannot work: it is a
    // shell builtin, and a child process could not change the environment of
    // a later sibling process anyway.
    $home = env('USER_HOME_PATH');

    $process = new Process(
        ['pip3', 'install', 'pdftotext'],
        null,
        // Passing null keeps the inherited environment untouched when no
        // home path is configured.
        $home ? ['HOME' => $home] : null
    );

    $process->run();

    if (! $process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }
}
||||
|
} |
@ -0,0 +1,52 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
use App\Parser\ParseTextArray; |
||||
|
|
||||
|
/**
 * Turns a text document into "<document directory>/document.md".
 *
 * The file is parsed with ParseTextArray, every entry is passed through
 * mb_convert_encoding() with UTF-8 as target, the result is rendered by
 * MDConvertor and the source document is deleted afterwards.
 */
class TextConvertor extends AbstractConvertor
{
    /**
     * Parse, re-encode and store the document.
     *
     * @return void
     *
     * @throws \Exception When the parser returns nothing for the document.
     */
    public function execute()
    {
        $parsed = (new ParseTextArray())->fromFile($this->storage->path($this->path));

        if (! $parsed) {
            throw new \Exception('Could not read content.');
        }

        $this->storeContent($this->convertToUTF8($parsed));
    }

    /**
     * Pass every leaf of the (possibly nested) content array through
     * mb_convert_encoding() targeting UTF-8.
     *
     * @param array $content
     *
     * @return array The converted copy of $content.
     */
    protected function convertToUTF8($content)
    {
        $toUtf8 = static function (&$value) {
            $value = mb_convert_encoding($value, 'UTF-8');
        };

        array_walk_recursive($content, $toUtf8);

        return $content;
    }

    /**
     * Write the Markdown file, then remove the source document.
     *
     * @param array $content
     *
     * @return void
     */
    protected function storeContent($content)
    {
        $this->storeMD($content);
        $this->deleteOriginalDocument();
    }

    /**
     * Render the content with MDConvertor and store it as document.md in the
     * document directory.
     *
     * @param array $content
     *
     * @return void
     */
    protected function storeMD($content)
    {
        $markdown = (new MDConvertor($content))->execute();

        $this->storage->put("$this->directoryPath/document.md", $markdown);
    }
}
Write
Preview
Loading…
Cancel
Save
Reference in new issue