You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
509 lines
14 KiB
509 lines
14 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use Illuminate\Support\Facades\Storage;
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
use League\HTMLToMarkdown\HtmlConverter;
|
|
|
|
class Convertor
|
|
{
|
|
/**
 * Storage disk through which every file is read, written and deleted.
 *
 * @var \Illuminate\Contracts\Filesystem\Filesystem
 */
private $storage;

/**
 * Disk-relative path of the file being processed; the conversion methods
 * overwrite it with the path of whatever they produce.
 */
private $path;

/**
 * Type of the source file; execute() compares it against 'txt', 'pdf' and 'docx'.
 */
protected $type;

/**
 * Remember which file to convert and bind the 'local' storage disk.
 *
 * @param mixed $path Path of the source file on the 'local' disk.
 * @param mixed $type Type of the source file.
 */
public function __construct($path, $type)
{
    $this->type = $type;
    $this->path = $path;
    $this->storage = Storage::disk('local');
}
|
|
|
|
/**
 * Run the conversion that matches the source type and report where the result lives.
 *
 * - 'txt' files are returned untouched.
 * - 'pdf' files go through convertPdfToMD().
 * - everything else is converted to docx first (unless it already is docx)
 *   and then to plain text.
 *
 * @return mixed The current value of $this->path after conversion.
 * @throws \Exception
 */
public function execute()
{
    if ($this->type === 'pdf') {
        $this->convertPdfToMD();
    } elseif ($this->type !== 'txt') {
        // Office-style documents: normalise to docx, then extract the text.
        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $this->convertDocumentToText();
    }

    return $this->path;
}
|
|
|
|
/**
 * Convert the current file (doc, dot, rtf, odt, ...) to docx with headless soffice.
 *
 * soffice is told to write into the disk's 'contracts' directory. On success the
 * source file is deleted and $this->path gets its ".<type>" part replaced by ".docx".
 *
 * @return void
 *
 * @throws ProcessFailedException When soffice exits with a non-zero status.
 */
private function convertToDocx()
{
    // HOME has to be part of the soffice process environment. Spawning a separate
    // "export HOME=..." process (as done previously) runs a non-existent executable
    // and could never influence the soffice process started afterwards.
    $home = env('USER_HOME_PATH');

    $process = new Process(
        [
            'soffice',
            '--headless',
            '--convert-to',
            'docx',
            $this->storage->path($this->path),
            '--outdir',
            $this->storage->path('contracts'),
        ],
        null,
        // null keeps the inherited environment when USER_HOME_PATH is not configured.
        $home ? ['HOME' => $home] : null
    );

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    $this->path = str_replace(".$this->type", '.docx', $this->path);
}
|
|
|
|
/**
 * Convert the current document to plain text with headless soffice.
 *
 * soffice is told to write into the disk's 'contracts' directory. On success the
 * source file is deleted and any '.docx' / '.bin' in $this->path becomes '.txt'.
 *
 * @return void
 *
 * @throws ProcessFailedException When soffice exits with a non-zero status.
 */
private function convertDocumentToText()
{
    // HOME has to be part of the soffice process environment. Spawning a separate
    // "export HOME=..." process (as done previously) runs a non-existent executable
    // and could never influence the soffice process started afterwards.
    $home = env('USER_HOME_PATH');

    $process = new Process(
        [
            'soffice',
            '--headless',
            '--convert-to',
            'txt',
            $this->storage->path($this->path),
            '--outdir',
            $this->storage->path('contracts'),
        ],
        null,
        // null keeps the inherited environment when USER_HOME_PATH is not configured.
        $home ? ['HOME' => $home] : null
    );

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    $this->path = str_replace(['.docx', '.bin'], '.txt', $this->path);
}
|
|
|
|
/**
 * Turn the current PDF into a .txt file: extracted text first, followed by the
 * OCR output of every image found in the PDF. The PDF itself is deleted.
 *
 * @return void
 *
 * @throws \Exception When the PDF yields neither text nor images.
 * @throws ProcessFailedException When one of the external tools fails.
 */
protected function convertPdfToText()
{
    $this->prepareForConvertPDF();

    // Same directory name getImagesFromPDF() derives and creates for the images.
    $imagesDir = str_replace('.pdf', '', $this->path);

    try {
        $images = $this->getImagesFromPDF();

        $contents = $this->getTextContentsFromPDF();

        if (!$contents && count($images) === 0) {
            throw new \Exception('Could not read from file.');
        }

        foreach ($images as $image) {
            try {
                $ocr = new OCR($this->storage->path($image));

                $contents = $contents . "\n" . $ocr->execute();
            } catch (\Exception $exception) {
                // Best effort: one unreadable image must not discard the rest of the document.
                \Illuminate\Support\Facades\Log::info('something wrong: ' . $exception->getMessage());
            }
        }
    } finally {
        // The directory is created even when the PDF has no images, so it is removed
        // unconditionally - also when extraction fails half-way.
        $this->storage->deleteDirectory($imagesDir);
    }

    $this->storage->delete($this->path);

    $this->path = str_replace('.pdf', '.txt', $this->path);

    $this->storage->put($this->path, $contents);
}
|
|
|
|
/**
 * Convert the current PDF to a Markdown file.
 *
 * PDFs that contain only text are rendered page by page to HTML (by
 * getContentsFromPdf()) and converted to Markdown; PDFs that contain only images
 * are run through OCR. PDFs mixing both are rejected.
 *
 * The result is stored at "<extraction directory>.md" and $this->path points to it.
 *
 * @return void
 *
 * @throws \Exception When the PDF has no usable content or mixes text and images.
 */
protected function convertPdfToMD()
{
    $result = $this->getContentsFromPdf();

    // getContentsFromPdf() has repointed $this->path at the extraction directory.
    $workDir = $this->path;

    try {
        if (!$result['has_images'] && !$result['has_text']) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        if ($result['has_text'] && $result['has_images']) {
            // Both text and images.
            throw new \Exception('Not supported for now.');
        }

        $markdown = '';

        if ($result['has_text']) {
            // One converter is enough; its configuration is the same for every page.
            $converter = new HtmlConverter();
            $converter->getConfig()->setOption('strip_tags', true);

            foreach ($result['htmls'] as $html) {
                $markdown = $markdown . $converter->convert($html);
            }
        } else {
            foreach ($this->storage->allFiles($workDir) as $file) {
                // Only the image files matter; the directory may contain other output too.
                if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'], true)) {
                    $ocr = new OCR($this->storage->path($file));

                    $markdown = $markdown . $ocr->execute();
                }
            }
        }
    } finally {
        // Everything needed is in memory by now (or the conversion failed);
        // either way the extraction directory must not be left behind.
        $this->storage->deleteDirectory($workDir);
    }

    $this->path = "$workDir.md";

    $this->storage->put($this->path, $markdown);
}
|
|
|
|
/**
 * Convert the current file to HTML with embedded images using headless soffice.
 *
 * soffice is told to write into the disk's 'contracts' directory. On success the
 * source file is deleted and $this->path gets its ".<type>" part replaced by ".html".
 *
 * @return void
 *
 * @throws ProcessFailedException When soffice exits with a non-zero status.
 */
private function convertToHtml()
{
    // HOME has to be part of the soffice process environment. Spawning a separate
    // "export HOME=..." process (as done previously) runs a non-existent executable
    // and could never influence the soffice process started afterwards.
    $home = env('USER_HOME_PATH');

    $process = new Process(
        [
            'soffice',
            '--headless',
            '--convert-to',
            'html:HTML:EmbedImages',
            $this->storage->path($this->path),
            '--outdir',
            $this->storage->path('contracts'),
        ],
        null,
        // null keeps the inherited environment when USER_HOME_PATH is not configured.
        $home ? ['HOME' => $home] : null
    );

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    $this->path = str_replace(".$this->type", '.html', $this->path);
}
|
|
|
|
/**
 * Convert the current file to XML by running `pdftohtml -xml -i` on it.
 *
 * Only pdftohtml is invoked here; no post-processing script is run by this method.
 * On success the source file is deleted and $this->path gets its ".<type>" part
 * replaced by ".xml".
 *
 * @return void
 *
 * @throws ProcessFailedException When pdftohtml exits with a non-zero status.
 */
private function convertToXML()
{
    $sourceFile = $this->storage->path($this->path);

    $pdfToXml = new Process(['pdftohtml', '-xml', '-i', $sourceFile]);
    $pdfToXml->run();

    if ($pdfToXml->isSuccessful() === false) {
        throw new ProcessFailedException($pdfToXml);
    }

    $xmlPath = str_replace('.' . $this->type, '.xml', $this->path);

    $this->storage->delete($this->path);
    $this->path = $xmlPath;
}
|
|
|
|
/**
 * Make sure the `pdftotext` Python package is installed by running `pip3 install`.
 *
 * NOTE(review): this runs pip on every call; installing the package while
 * provisioning the server would avoid that cost - confirm before changing.
 *
 * @return void
 *
 * @throws ProcessFailedException When pip3 exits with a non-zero status.
 */
protected function prepareForConvertPDF()
{
    // HOME has to be part of the pip3 process environment. Spawning a separate
    // "export HOME=..." process (as done previously) runs a non-existent executable
    // and could never influence the pip3 process started afterwards.
    $home = env('USER_HOME_PATH');

    $process = new Process(
        [
            'pip3',
            'install',
            'pdftotext',
        ],
        null,
        // null keeps the inherited environment when USER_HOME_PATH is not configured.
        $home ? ['HOME' => $home] : null
    );

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }
}
|
|
|
|
/**
 * Extract the images of the current PDF with `pdfimages`.
 *
 * The images are written, with the "ocr" file-name root, into a directory named
 * after the PDF path without its '.pdf' part; that directory is created first.
 *
 * @return array Every file found in that directory after extraction.
 *
 * @throws ProcessFailedException When pdfimages exits with a non-zero status.
 */
protected function getImagesFromPDF()
{
    $imagesDir = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($imagesDir);

    $command = [
        'pdfimages',
        '-p',
        $this->storage->path($this->path),
        '-tiff',
        $this->storage->path("$imagesDir/ocr"),
    ];

    $extractor = new Process($command);
    $extractor->run();

    if ($extractor->isSuccessful() === false) {
        throw new ProcessFailedException($extractor);
    }

    return $this->storage->allFiles($imagesDir);
}
|
|
|
|
/**
 * Extract the text of the current PDF by running the storage/scripts/parse-pdf.py
 * script with python3.
 *
 * The script is pointed at the PDF (-i) and at a sibling file with the '.pdf'
 * part replaced by '.txt' (-o); that output file is then read back.
 *
 * @return string|false Contents of the output file, or false when it cannot be read.
 *
 * @throws ProcessFailedException When the script exits with a non-zero status.
 */
protected function getTextContentsFromPDF()
{
    $textFile = $this->storage->path(str_replace('.pdf', '.txt', $this->path));
    $script = storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py');
    $pdfFile = $this->storage->path($this->path);

    $parser = new Process(['python3', $script, '-i', $pdfFile, '-o', $textFile]);
    $parser->run();

    if ($parser->isSuccessful() === false) {
        throw new ProcessFailedException($parser);
    }

    return file_get_contents($textFile);
}
|
|
|
|
/**
 * Convert the current PDF to Markdown via `pdftohtml -noframes`.
 *
 * pdftohtml writes into a directory named after the PDF path without its '.pdf'
 * part. The PDF is deleted, the generated 'html.html' is converted to Markdown
 * and stored at "<directory>.md", which becomes $this->path. The directory itself
 * is kept.
 *
 * @return void
 *
 * @throws ProcessFailedException When pdftohtml exits with a non-zero status.
 * @throws \Exception When pdftohtml did not produce an 'html.html' file.
 */
protected function getHtmlContentsFromPdfWithImages()
{
    $dirName = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($dirName);

    $process = new Process([
        'pdftohtml',
        '-noframes',
        $this->storage->path($this->path),
        $this->storage->path("$dirName/html"),
    ]);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    $this->path = $dirName;

    $htmlFile = null;

    foreach ($this->storage->allFiles($this->path) as $file) {
        if (pathinfo($file, PATHINFO_BASENAME) === 'html.html') {
            $htmlFile = $file;

            break;
        }
    }

    // Previously a missing file led to an array access with a null index.
    if ($htmlFile === null) {
        throw new \Exception('Cannot get pdf file contents.');
    }

    $converter = new HtmlConverter();
    $converter->getConfig()->setOption('strip_tags', true);

    $contents = $converter->convert($this->storage->get($htmlFile));

    $this->path = "$this->path.md";

    // The leftover dd() debug call that used to follow this line aborted the
    // whole request and has been removed.
    $this->storage->put($this->path, $contents);
}
|
|
|
|
/**
 * Run `pdftohtml -xml` on the current PDF and rebuild one HTML document per page.
 *
 * Side effects: creates a directory named after the PDF path without its '.pdf'
 * part, lets pdftohtml write there (file-name root "html"), deletes the PDF and
 * repoints $this->path at that directory.
 *
 * Every <image> element becomes an absolutely positioned <img>; every <text>
 * element becomes an absolutely positioned h1/h2/h3/span/p (see getTag()) that
 * keeps its inline markup.
 *
 * @return array{has_images: bool, has_text: bool, htmls: string[]}
 *
 * @throws ProcessFailedException When pdftohtml exits with a non-zero status.
 * @throws \Exception When the generated XML cannot be parsed.
 */
protected function getContentsFromPdf()
{
    $dirName = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($dirName);

    $process = new Process([
        'pdftohtml',
        '-xml',
        $this->storage->path($this->path),
        $this->storage->path("$dirName/html"),
    ]);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    $this->path = $dirName;

    $contents = $this->storage->get("$this->path/html.xml");

    $xml = is_string($contents) && $contents !== '' ? simplexml_load_string($contents) : false;

    // Without this guard a parse failure surfaced later as an error on `false`.
    if ($xml === false) {
        throw new \Exception('Cannot parse pdf file contents.');
    }

    // First pass: collect every font definition, keyed by id, across all pages,
    // so a <text> can reference a font declared on another page.
    $fonts = [];

    foreach ($xml->page as $page) {
        foreach ($page->fontspec as $spec) {
            $fonts[(int) $spec['id']] = [
                'family' => (string) $spec['family'],
                'size' => (string) $spec['size'],
                'color' => (string) $spec['color'],
            ];
        }
    }

    // Used when a <text> references a font id that was never declared; before,
    // that case aborted the whole loop through the catch block below.
    $unknownFont = ['family' => 'inherit', 'size' => '0', 'color' => 'inherit'];

    $htmls = [];
    $hasImages = false;
    $hasText = false;

    try {
        foreach ($xml->page as $page) {
            $html = '';

            $previousNode = null;

            foreach ($page as $node) {
                $nodeName = $node->getName();

                if ($nodeName === 'image') {
                    $html = $html . '<img style="position: absolute; top: ' . $node['top'] . 'px; left: ' . $node['left'] . 'px;" width="' . $node['width'] . '" height="' . $node['height'] . '" src="' . $node['src'] . '">';

                    $hasImages = true;
                } elseif ($nodeName === 'text') {
                    $font = $fonts[(int) $node['font']] ?? $unknownFont;

                    $style = 'position: absolute;'
                        . "color: {$font['color']};"
                        . "font-family: {$font['family']};"
                        . 'font-weight: 900;'
                        . 'width: ' . $node['width'] . 'px;'
                        . 'height: ' . $node['height'] . 'px;'
                        . 'top: ' . $node['top'] . 'px;'
                        . 'left: ' . $node['left'] . 'px;';

                    // Keep the element's inner markup exactly as serialised. Picking only
                    // the first <i> or <b> child (or the bare text) dropped content for
                    // mixed runs, nested <i><b>..</b></i> and links, and re-emitted
                    // decoded text without escaping it.
                    $content = (string) preg_replace('/^<text\b[^>]*>|<\/text>\s*$/', '', (string) $node->asXML());

                    // @TODO Must chain paragraphs if top are almost same.

                    $tag = $this->getTag($node, $previousNode, $font['size']);

                    $html = $html . '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';

                    $hasText = true;
                }

                $previousNode = $node;
            }

            $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
        }
    } catch (\Exception $exception) {
        // Best effort: keep whatever pages were rendered before the failure.
        \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
    }

    return [
        'has_images' => $hasImages,
        'has_text' => $hasText,
        'htmls' => $htmls,
    ];
}
|
|
|
|
/**
 * Pick the HTML tag for a text element.
 *
 * Font sizes above 24 / 18 / 16 map to h1 / h2 / h3. Otherwise the element is a
 * 'span' when its 'top' is within 5 units of the previous element's 'top' (i.e.
 * it sits on practically the same line), and a 'p' in every other case.
 *
 * @param \SimpleXMLElement|array      $p         Current element; its 'top' entry is read.
 * @param \SimpleXMLElement|array|null $previousP Element handled just before, if any.
 * @param mixed                        $size      Font size of the current element.
 *
 * @return string One of 'h1', 'h2', 'h3', 'span', 'p'.
 */
protected function getTag($p, $previousP, $size)
{
    $size = (float) $size;

    if ($size > 24) {
        return 'h1';
    }

    if ($size > 18) {
        return 'h2';
    }

    if ($size > 16) {
        return 'h3';
    }

    if ($previousP) {
        // Compare the absolute distance: with a signed difference every element
        // placed higher on the page than its predecessor counted as "same line".
        $top = (float) (string) ($p['top'] ?? 0);
        $previousTop = (float) (string) ($previousP['top'] ?? 0);

        if (abs($top - $previousTop) <= 5) {
            return 'span';
        }
    }

    return 'p';
}
|
|
}
|