You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
193 lines
4.5 KiB
193 lines
4.5 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use Illuminate\Support\Facades\Storage;
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
|
|
/**
 * Converts an uploaded document stored on the "local" disk into a plain-text file.
 *
 * The heavy lifting is delegated to external programs (LibreOffice's `soffice`,
 * `python3`, `pdftohtml`) that are executed through Symfony's Process component.
 * After every successful conversion step the source file is removed and the
 * internal path is updated to point at the freshly produced file.
 */
class Convertor
{
    /**
     * Directory, relative to the disk root, that `soffice` writes converted files into.
     */
    const OUTPUT_DIRECTORY = 'contracts';

    /**
     * @var \Illuminate\Contracts\Filesystem\Filesystem
     */
    private $storage;

    /**
     * Path of the current document, relative to the disk root.
     *
     * @var string
     */
    private $path;

    /**
     * Type of the document handed to the constructor ('txt', 'pdf', 'docx', ...).
     *
     * @var string
     */
    protected $type;

    /**
     * @param string $path Path of the document, relative to the root of the "local" disk.
     * @param string $type Document type; 'txt', 'pdf' and 'docx' get dedicated handling,
     *                     anything else is first converted to docx.
     */
    public function __construct($path, $type)
    {
        $this->storage = Storage::disk('local');
        $this->path = $path;
        $this->type = $type;
    }

    /**
     * Run the conversion pipeline that matches the document type.
     *
     * @return string Path (relative to the disk root) of the resulting file.
     *
     * @throws ProcessFailedException When one of the external commands exits unsuccessfully.
     */
    public function execute()
    {
        // Plain text needs no conversion at all.
        if ($this->type === 'txt') {
            return $this->path;
        }

        if ($this->type === 'pdf') {
            $this->convertPdfToText();

            return $this->path;
        }

        // Everything that is not already docx takes a detour through docx first.
        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $this->convertDocumentToText();
        // HTML conversion is currently disabled.
        //$this->convertToHtml();

        return $this->path;
    }

    /**
     * Convert doc, dot, rtf, odt, ... to docx.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToDocx()
    {
        $this->convertWithOffice('docx', 'docx');
    }

    /**
     * Convert a docx file to plain text.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertDocumentToText()
    {
        $this->convertWithOffice('txt', 'txt');
    }

    /**
     * Convert a PDF to plain text with the bundled `parse-pdf.py` script.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertPdfToText()
    {
        // NOTE(review): this installs the Python dependency on every conversion, which
        // needs network access and slows each call down. Kept to preserve behaviour;
        // consider moving it to the deployment/provisioning step.
        $this->runProcess(['pip3', 'install', 'pdftotext']);

        // One target path is used both for the script output and for the new internal
        // path, so the two can never disagree.
        $textPath = self::swapExtension($this->path, 'txt');

        $this->runProcess([
            'python3',
            storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
            '-i',
            $this->storage->path($this->path),
            '-o',
            $this->storage->path($textPath),
        ]);

        $this->replaceSource($textPath);
    }

    /**
     * Convert the document to a single HTML file with embedded images.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToHtml()
    {
        $this->convertWithOffice('html:HTML:EmbedImages', 'html');
    }

    /**
     * Convert a PDF to XML using `pdftohtml -xml`.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToXML()
    {
        // No explicit output name is passed, so the result is expected next to the
        // input file with an .xml extension.
        $this->runProcess([
            'pdftohtml',
            '-xml',
            '-i',
            $this->storage->path($this->path),
        ]);

        $this->replaceSource(self::swapExtension($this->path, 'xml'));
    }

    /**
     * Run a headless LibreOffice conversion of the current document.
     *
     * @param string $filter    Value for `--convert-to` (may include an export filter).
     * @param string $extension Extension of the file LibreOffice produces.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertWithOffice($filter, $extension)
    {
        $this->runProcess([
            'soffice',
            '--headless',
            '--convert-to',
            $filter,
            $this->storage->path($this->path),
            '--outdir',
            $this->storage->path(self::OUTPUT_DIRECTORY),
        ]);

        // soffice writes "<base name>.<extension>" into the output directory regardless
        // of the extension (or directory) of the input, so derive the new path from that
        // instead of string-replacing inside the old path.
        $converted = self::OUTPUT_DIRECTORY . '/' . pathinfo($this->path, PATHINFO_FILENAME) . '.' . $extension;

        $this->replaceSource($converted);
    }

    /**
     * Execute an external command and fail loudly when it does not succeed.
     *
     * @param array $command Command and arguments, one array element each.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function runProcess(array $command)
    {
        $process = new Process($command, null, $this->processEnvironment());
        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Build the extra environment for child processes.
     *
     * HOME has to be handed to the process itself: running `export HOME=...` as a
     * separate command cannot influence the environment of later processes.
     *
     * NOTE(review): env() returns null outside of config files once the Laravel
     * configuration is cached — consider reading this value through config().
     *
     * @return array|null Variables to add to the inherited environment, or null for none.
     */
    private function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        return empty($home) ? null : ['HOME' => $home];
    }

    /**
     * Drop the source file and point the converter at the converted one.
     *
     * @param string $convertedPath Path of the converted file, relative to the disk root.
     *
     * @return void
     */
    private function replaceSource($convertedPath)
    {
        // When input and output share the same path the tool has overwritten the
        // source in place; deleting it would throw away the result.
        if ($convertedPath !== $this->path) {
            $this->storage->delete($this->path);
        }

        $this->path = $convertedPath;
    }

    /**
     * Replace only the final extension of a path, leaving directories and the
     * rest of the file name untouched.
     *
     * @param string $path      Original path.
     * @param string $extension New extension without the leading dot.
     *
     * @return string
     */
    private static function swapExtension($path, $extension)
    {
        $parts = pathinfo($path);

        $directory = '';
        if (isset($parts['dirname']) && $parts['dirname'] !== '.') {
            $directory = rtrim($parts['dirname'], '/') . '/';
        }

        return $directory . $parts['filename'] . '.' . $extension;
    }
}
|