Orzu Ionut
3 years ago
10 changed files with 516 additions and 547 deletions
-
3 README.md
-
24 app/Ingest/AbstractConvertor.php
-
434 app/Ingest/Convertor.php
-
6 app/Ingest/DocumentHandler.php
-
46 app/Ingest/DocxConvertor.php
-
50 app/Ingest/OtherConvertor.php
-
271 app/Ingest/PDFConvertor.php
-
52 app/Ingest/TextConvertor.php
-
103 app/Jobs/IngestDocuments.php
-
74 app/Jobs/SendToCore.php
@ -0,0 +1,24 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
/**
 * Base class for the ingest convertors.
 *
 * Holds the storage handle, the path of the document being converted and the
 * directory that contains it, and offers a helper to remove the original
 * document once a convertor no longer needs it.
 */
abstract class AbstractConvertor
{
    /**
     * Storage handle used for every file operation.
     *
     * NOTE(review): presumably a Laravel filesystem disk; this class itself
     * only calls delete() on it.
     *
     * @var mixed
     */
    protected $storage;

    /**
     * Storage-relative path of the document to convert.
     *
     * @var string
     */
    protected $path;

    /**
     * Directory part of $path, as reported by pathinfo().
     *
     * @var string
     */
    protected $directoryPath;

    /**
     * @param mixed  $storage Storage handle the document lives on.
     * @param string $path    Path of the document on that storage.
     */
    public function __construct($storage, $path)
    {
        [$this->storage, $this->path] = [$storage, $path];

        $this->directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
    }

    /**
     * Run the conversion. Each concrete convertor defines what that means.
     */
    abstract public function execute();

    /**
     * Remove the document this convertor was created for from storage.
     */
    protected function deleteOriginalDocument()
    {
        $originalPath = $this->path;

        $this->storage->delete($originalPath);
    }
}
@ -0,0 +1,46 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
use Symfony\Component\Process\Exception\ProcessFailedException; |
|||
use Symfony\Component\Process\Process; |
|||
|
|||
/**
 * Converts a docx document to plain text with `soffice`, then hands the
 * resulting text file to TextConvertor.
 */
class DocxConvertor extends AbstractConvertor
{
    /**
     * Convert the document to text and run the text convertor on the result.
     *
     * NOTE(review): the follow-up step reads "document.txt" from the
     * document's directory, so this assumes the input file is named
     * "document.*" (soffice names its output after the input) - confirm
     * against the caller.
     *
     * @return void
     *
     * @throws ProcessFailedException When the soffice process fails.
     */
    public function execute()
    {
        $this->convertToText();

        $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt");

        $convertor->execute();
    }

    /**
     * Convert docx file to text
     *
     * Runs `soffice --headless --convert-to txt` on the document, writing the
     * output next to it, and deletes the original document on success.
     *
     * @return void
     *
     * @throws ProcessFailedException When the soffice process fails.
     */
    protected function convertToText()
    {
        // HOME is handed to the soffice process itself. Running
        // "export HOME=..." as a separate process cannot work: "export" is a
        // shell builtin, and a child process cannot change the environment of
        // a process started later.
        $process = new Process(
            [
                'soffice',
                '--headless',
                '--convert-to',
                'txt',
                $this->storage->path($this->path),
                '--outdir',
                $this->storage->path($this->directoryPath),
            ],
            null,
            $this->processEnvironment()
        );

        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        $this->deleteOriginalDocument();
    }

    /**
     * Extra environment variables for the soffice process.
     *
     * @return array|null HOME override when USER_HOME_PATH is configured,
     *                    null to leave the inherited environment untouched.
     */
    private function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        return $home ? ['HOME' => $home] : null;
    }
}
@ -0,0 +1,50 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
use Symfony\Component\Process\Exception\ProcessFailedException; |
|||
use Symfony\Component\Process\Process; |
|||
|
|||
/**
 * Converts a non-docx office document to docx with `soffice`, then hands the
 * resulting file to DocxConvertor.
 */
class OtherConvertor extends AbstractConvertor
{
    /**
     * Convert the document to docx and run the docx convertor on the result.
     *
     * NOTE(review): the follow-up step reads "document.docx" from the
     * document's directory, so this assumes the input file is named
     * "document.*" (soffice names its output after the input) - confirm
     * against the caller.
     *
     * @return void
     *
     * @throws ProcessFailedException When a soffice process fails.
     */
    public function execute()
    {
        $this->convertToDocx();

        $convertor = new DocxConvertor($this->storage, "$this->directoryPath/document.docx");

        $convertor->execute();
    }

    /**
     * Convert doc,dot,rtf,odt,pdf,docx to docx
     *
     * Runs `soffice --headless --convert-to docx` on the document, writing the
     * output next to it, and deletes the original document on success.
     *
     * NOTE(review): if the input already is "document.docx", the converted
     * file would share the original's path and be removed by the delete
     * below - verify that callers never route docx files here.
     *
     * @return void
     *
     * @throws ProcessFailedException When the soffice process fails.
     */
    private function convertToDocx()
    {
        // HOME is handed to the soffice process itself. Running
        // "export HOME=..." as a separate process cannot work: "export" is a
        // shell builtin, and a child process cannot change the environment of
        // a process started later.
        $process = new Process(
            [
                'soffice',
                '--headless',
                '--convert-to',
                'docx',
                $this->storage->path($this->path),
                '--outdir',
                $this->storage->path($this->directoryPath),
            ],
            null,
            $this->processEnvironment()
        );

        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        $this->deleteOriginalDocument();
    }

    /**
     * Extra environment variables for the soffice process.
     *
     * @return array|null HOME override when USER_HOME_PATH is configured,
     *                    null to leave the inherited environment untouched.
     */
    private function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        return $home ? ['HOME' => $home] : null;
    }
}
@ -0,0 +1,271 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
use League\HTMLToMarkdown\HtmlConverter; |
|||
use Symfony\Component\Process\Exception\ProcessFailedException; |
|||
use Symfony\Component\Process\Process; |
|||
|
|||
/**
 * Converts a PDF document into "document.md" inside the document's directory.
 *
 * The PDF is run through `pdftohtml -xml`. The XML is rebuilt into one HTML
 * document per page, and those pages are converted to Markdown. When the PDF
 * holds only images, the extracted image files are passed to OCR instead.
 */
class PDFConvertor extends AbstractConvertor
{
    /**
     * Convert the PDF and store the result as "document.md".
     *
     * prepareForConvertPDF() is intentionally not called here.
     *
     * @return void
     *
     * @throws \Exception             When the PDF yields neither text nor images,
     *                                or its XML cannot be processed.
     * @throws ProcessFailedException When the pdftohtml process fails.
     */
    public function execute()
    {
        $result = $this->getFileContents();

        if (! $result['has_images'] && ! $result['has_text']) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $markdownPath = "$this->directoryPath/document.md";

        if ($result['has_text']) {
            // The converter is configured once and reused for every page.
            $converter = new HtmlConverter();
            $converter->getConfig()->setOption('strip_tags', true);

            $mdContents = '';

            foreach ($result['htmls'] as $html) {
                $mdContents .= "\n\n" . $converter->convert($html);
            }

            $this->storage->put($markdownPath, $mdContents);

            return;
        }

        // Only contains images.
        $imagesContent = '';

        // The extracted images live in the document's directory. The original
        // document path is a file that getFileContents() has already deleted,
        // so listing it would never yield any image.
        $files = $this->storage->allFiles($this->directoryPath);

        foreach ($files as $file) {
            // Only get the image files from the directory, it may contain some empty html files too.
            // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
            if (! in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'], true)) {
                continue;
            }

            $ocr = new OCR($this->storage->path($file));

            $imagesContent .= $ocr->execute();

            $this->storage->delete($file);
        }

        $this->storage->put($markdownPath, $imagesContent);
    }

    /**
     * Run `pdftohtml -xml` on the document, delete the original and parse the
     * generated XML.
     *
     * @return array Same shape as getDataFromXML().
     *
     * @throws ProcessFailedException When the pdftohtml process fails.
     * @throws \Exception             When the generated XML cannot be processed.
     */
    protected function getFileContents()
    {
        $outputPath = $this->storage->path("$this->directoryPath/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);

        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->deleteOriginalDocument();

        return $this->getDataFromXML();
    }

    /**
     * Read "html.xml" from the document's directory and rebuild it as one
     * HTML document per page.
     *
     * Nodes of each page are grouped by their "top" attribute and emitted in
     * ascending "top" order. Images are replaced in the flow by a "Fig. N"
     * caption and appended at the end of their page.
     *
     * Cleanup: the whole directory is deleted when nothing usable was found
     * or when building the HTML fails; otherwise only the XML file is deleted.
     *
     * @return array{has_images: bool, has_text: bool, htmls: string[]}
     *
     * @throws \Exception When the XML cannot be parsed or processed.
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "$this->directoryPath/html.xml";

        $xml = $this->loadXML($xmlFilePath);

        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'];

            $orderedList[$pageNumber] = [];

            foreach ($page as $p) {
                if ($p->getName() === 'fontspec') {
                    $fonts[(int) $p['id']] = [
                        'family' => (string) $p['family'],
                        'size' => (string) $p['size'],
                        'color' => (string) $p['color'],
                    ];
                }

                // Only positioned nodes take part in the page layout.
                if (isset($p['top'])) {
                    $orderedList[$pageNumber][(int) $p['top']][] = $p;
                }
            }

            ksort($orderedList[$pageNumber]);
        }

        $htmls = [];
        $hasImages = false;
        $hasText = false;

        $imagesCount = 0;
        // Images are always moved to the end of their page; the inline
        // branch below is kept for when this flag is turned off.
        $imagesInFooter = true;

        try {
            foreach ($orderedList as $page) {
                $html = '';
                $footerImages = [];

                foreach ($page as $items) {
                    $continuousP = '';

                    foreach ($items as $p) {
                        if ($p->getName() == 'image') {
                            $hasImages = true;

                            $imagesCount += 1;
                            $caption = "Fig. $imagesCount";

                            $imageHTML = $this->handleImage($p, $caption);

                            if (! $imagesInFooter) {
                                $html .= $imageHTML;
                            } else {
                                $html .= "<p> $caption </p>";

                                $footerImages[] = $imageHTML;
                            }
                        }

                        if ($p->getName() == 'text') {
                            $continuousP .= $this->handleText($p, $fonts);

                            $hasText = true;
                        }
                    }

                    $html .= '<p>' . $continuousP . '</p>';
                }

                if ($imagesInFooter) {
                    foreach ($footerImages as $footerImage) {
                        $html .= '<p>' . $footerImage . '</p>';
                    }
                }

                $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
            }
        } catch (\Exception $exception) {
            $this->storage->deleteDirectory($this->directoryPath);

            // Log the message as well as the trace, and keep the original
            // exception attached so the cause is not lost.
            \Illuminate\Support\Facades\Log::info(
                $exception->getMessage() . "\n" . $exception->getTraceAsString()
            );

            throw new \Exception('Something went wrong.', 0, $exception);
        }

        if (! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return [
            'has_images' => $hasImages,
            'has_text' => $hasText,
            'htmls' => $htmls,
        ];
    }

    /**
     * Build the HTML for one image node.
     *
     * @param \SimpleXMLElement $p       Image node with top/left/width/height/src attributes.
     * @param string            $caption Text used for both "alt" and "title".
     *
     * @return string An absolutely positioned <img> wrapped in <br> tags.
     */
    protected function handleImage($p, $caption)
    {
        // Only the file name is kept, so the image is referenced relative to
        // the generated document.
        $src = './' . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        $html = '<br>';
        $html .= '<img style="position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;" width="' . $p['width'] . '" height="' . $p['height'] . '" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
        $html .= '<br>';
        $html .= '<br>';

        return $html;
    }

    /**
     * Build the HTML for one text node.
     *
     * The node's full inner markup is kept, so text around or inside inline
     * children (for example "<b>x</b> tail" or "<i><b>x</b></i>") is not
     * lost, and plain text stays XML-escaped.
     *
     * @param \SimpleXMLElement $p     Text node with font/top/left/width/height attributes.
     * @param array             $fonts Font specs keyed by font id, each with
     *                                 'size', 'color' and 'family'.
     *
     * @return string The text wrapped in the tag chosen by getTag().
     */
    protected function handleText($p, $fonts)
    {
        $id = (int) $p['font'];
        $font_size = $fonts[$id]['size'];
        $font_color = $fonts[$id]['color'];
        $font_family = $fonts[$id]['family'];

        $style = 'position: absolute;';
        $style .= "color: $font_color;";
        $style .= "font-family: $font_family;";
        $style .= "font-weight: 900;";
        $style .= "width: " . $p['width'] . "px;";
        $style .= "height: " . $p['height'] . "px;";
        $style .= "top: " . $p['top'] . "px;";
        $style .= "left: " . $p['left'] . "px;";
        $style .= "font-size: $font_size" . "px;";

        $content = $this->innerXML($p);

        $tag = $this->getTag($font_size);

        return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Pick the HTML tag for a font size.
     *
     * @param int|string $size Font size.
     *
     * @return string 'h1' above 24, 'h2' above 18, 'h3' above 16, else 'span'.
     */
    protected function getTag($size)
    {
        if ($size > 24) {
            return 'h1';
        }

        if ($size > 18) {
            return 'h2';
        }

        if ($size > 16) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Install the "pdftotext" package with pip3.
     *
     * @return void
     *
     * @throws ProcessFailedException When the pip3 process fails.
     */
    protected function prepareForConvertPDF()
    {
        // HOME is handed to the pip3 process itself. Running
        // "export HOME=..." as a separate process cannot affect it.
        $home = env('USER_HOME_PATH');

        $process = new Process(
            [
                'pip3',
                'install',
                'pdftotext',
            ],
            null,
            $home ? ['HOME' => $home] : null
        );

        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Load and parse an XML file from storage.
     *
     * @param string $xmlFilePath Storage path of the XML file.
     *
     * @return \SimpleXMLElement
     *
     * @throws \Exception When the file is not well-formed XML; the document's
     *                    directory is deleted first.
     */
    private function loadXML($xmlFilePath)
    {
        // Collect libxml errors internally so malformed XML yields `false`
        // instead of emitting warnings.
        $previous = libxml_use_internal_errors(true);

        $xml = simplexml_load_string((string) $this->storage->get($xmlFilePath));

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($xml === false) {
            $this->storage->deleteDirectory($this->directoryPath);

            throw new \Exception('Cannot get pdf file contents.');
        }

        return $xml;
    }

    /**
     * Serialize everything inside a node (text and child elements) without
     * the node's own tag.
     *
     * @param \SimpleXMLElement $node
     *
     * @return string Inner markup with text nodes XML-escaped.
     */
    private function innerXML($node)
    {
        $inner = '';

        foreach (dom_import_simplexml($node)->childNodes as $child) {
            $inner .= $child->ownerDocument->saveXML($child);
        }

        return $inner;
    }
}
@ -0,0 +1,52 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
use App\Parser\ParseTextArray; |
|||
|
|||
/**
 * Turns a text document into "document.md" inside the document's directory.
 *
 * The file is parsed with ParseTextArray, every entry is passed through
 * mb_convert_encoding() with 'UTF-8' as the target, and the result is
 * rendered by MDConvertor.
 */
class TextConvertor extends AbstractConvertor
{
    /**
     * Parse the document, normalise its encoding and store the Markdown.
     *
     * @return void
     *
     * @throws \Exception When the parser returns nothing usable.
     */
    public function execute()
    {
        $absolutePath = $this->storage->path($this->path);

        $content = (new ParseTextArray())->fromFile($absolutePath);

        if ( ! $content) {
            throw new \Exception('Could not read content.');
        }

        $this->storeContent($this->convertToUTF8($content));
    }

    /**
     * Pass every leaf of the parsed content through mb_convert_encoding().
     *
     * @param array $content Parsed content; nested arrays are walked recursively.
     *
     * @return array The same structure with every leaf converted.
     */
    protected function convertToUTF8($content)
    {
        $toUTF8 = function (&$entry) {
            $entry = mb_convert_encoding($entry, 'UTF-8');
        };

        array_walk_recursive($content, $toUTF8);

        return $content;
    }

    /**
     * Store the Markdown version, then remove the original document.
     *
     * @param array $content Parsed, converted content.
     *
     * @return void
     */
    protected function storeContent($content)
    {
        $this->storeMD($content);

        $this->deleteOriginalDocument();
    }

    /**
     * Render the content with MDConvertor and write it to "document.md".
     *
     * @param array $content Parsed, converted content.
     *
     * @return void
     */
    protected function storeMD($content)
    {
        $markdown = (new MDConvertor($content))->execute();

        $this->storage->put("$this->directoryPath/document.md", $markdown);
    }
}
Write
Preview
Loading…
Cancel
Save
Reference in new issue