Repo for the search and displace ingest module, which takes ODF, DOCX and PDF files and converts them into Markdown (.md) for use with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

84 lines
2.1 KiB

<?php
namespace App\Ingest;
use Illuminate\Support\Facades\Storage;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
/**
 * Entry point of the ingest conversion: picks a type-specific convertor for a
 * file on the "local" storage disk and runs it.
 *
 * The concrete convertors (TextConvertor, PDFConvertor, DocxConvertor,
 * OtherConvertor) and the Office helper are defined elsewhere in the project.
 */
class Convertor
{
    /**
     * Disk the source file lives on (the "local" disk).
     *
     * @var \Illuminate\Contracts\Filesystem\Filesystem
     */
    private $storage;

    /**
     * Path of the file being converted, relative to the storage disk root.
     *
     * @var string
     */
    private $path;

    /**
     * File type, compared against 'txt', 'pdf' and 'docx' and used as the
     * file extension (without the leading dot) when renaming.
     *
     * @var string
     */
    protected $type;

    /**
     * @param string $path Path of the file, relative to the "local" disk.
     * @param string $type File type / extension without the dot (e.g. 'pdf').
     */
    public function __construct($path, $type)
    {
        $this->storage = Storage::disk('local');
        $this->path = $path;
        $this->type = $type;
    }

    /**
     * Runs the convertor matching the file type.
     *
     * The comparison is strict and case-sensitive; any type other than
     * 'txt', 'pdf' or 'docx' is handed to OtherConvertor.
     *
     * @throws \Exception
     */
    public function execute()
    {
        if ($this->type === 'txt') {
            $convertor = new TextConvertor($this->storage, $this->path);
        } elseif ($this->type === 'pdf') {
            $convertor = new PDFConvertor($this->storage, $this->path);
        } elseif ($this->type === 'docx') {
            $convertor = new DocxConvertor($this->storage, $this->path);
        } else {
            $convertor = new OtherConvertor($this->storage, $this->path);
        }

        $convertor->execute();

        // The HTML conversion step (convertToHtml) is currently disabled and
        // is not part of this pipeline.
    }

    /**
     * Converts the file to HTML through the Office helper, deletes the source
     * file and points $this->path at the resulting ".html" file.
     *
     * Not called from execute() at the moment.
     *
     * @throws \Exception When the Office helper reports a failure.
     */
    private function convertToHtml()
    {
        $office = new Office();

        $success = $office->run(
            'html:HTML:EmbedImages',
            $this->storage->path($this->path),
            $this->storage->path('contracts')
        );

        if (! $success) {
            throw new \Exception('Something went wrong while tried converting to HTML for file: ' . $this->path);
        }

        $this->storage->delete($this->path);
        $this->path = $this->pathWithExtension('html');
    }

    /**
     * Converts the file to XML by running `pdftohtml -xml -i` on it, deletes
     * the source file and points $this->path at the resulting ".xml" file.
     *
     * NOTE(review): an earlier comment mentioned a Python script that fixes
     * the paragraphs afterwards; no such step is run in this method.
     *
     * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
     */
    private function convertToXML()
    {
        $process = new Process([
            'pdftohtml',
            '-xml',
            '-i',
            $this->storage->path($this->path)
        ]);

        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        $this->storage->delete($this->path);
        $this->path = $this->pathWithExtension('xml');
    }

    /**
     * Returns $this->path with its trailing ".<type>" suffix replaced by
     * ".<extension>".
     *
     * Only the end of the path is touched, so a ".<type>" fragment inside a
     * directory or base name is left alone. When the path does not end with
     * ".<type>" (or the type is empty) the path is returned unchanged.
     *
     * @param string $extension New extension without the leading dot.
     * @return string
     */
    private function pathWithExtension($extension)
    {
        $suffix = '.' . $this->type;
        $suffixLength = strlen($suffix);

        // A bare "." means there is no type to strip; never rewrite on that.
        if ($suffixLength < 2 || substr($this->path, -$suffixLength) !== $suffix) {
            return $this->path;
        }

        return substr($this->path, 0, -$suffixLength) . '.' . $extension;
    }
}