You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
183 lines
4.1 KiB
183 lines
4.1 KiB
<?php
|
|
|
|
namespace App\Jobs;
|
|
|
|
use App\Ingest\Convertor;
|
|
use App\Ingest\MDConvertor;
|
|
use App\Parser\ParseXml;
|
|
use App\Parser\DocxParser\ParseDocx;
|
|
use App\Parser\HtmlParser\ParseHtml;
|
|
use App\Parser\ParseHtmlArray;
|
|
use App\Parser\ParseTextArray;
|
|
use Illuminate\Bus\Queueable;
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
use Illuminate\Foundation\Bus\Dispatchable;
|
|
use Illuminate\Queue\InteractsWithQueue;
|
|
use Illuminate\Support\Facades\Log;
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
/**
 * Queued job that ingests a single document.
 *
 * The job runs the file through Convertor, parses the converted file with
 * ParseTextArray, renders the parsed content with MDConvertor into a ".md"
 * file on the "local" disk, and finally dispatches SendToCore with the path
 * of that Markdown file.
 */
class IngestDocuments implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable;

    /**
     * Path of the file being processed, relative to the "local" disk.
     *
     * handle() overwrites it with the value returned by Convertor::execute().
     *
     * @var string
     */
    private $path;

    /**
     * Document type forwarded to Convertor. The value 'pdf' selects the
     * wait-then-parse branch in getContent().
     *
     * @var string
     */
    protected $type;

    /**
     * Maximum number of seconds getContent() waits for a converted PDF to
     * appear on disk before giving up.
     *
     * @var int
     */
    protected $pdfWaitTimeout = 600;

    /**
     * @var \Illuminate\Contracts\Filesystem\Filesystem
     */
    private $storage;

    // NOTE(review): the four parsers below are instantiated in the constructor
    // but are not referenced anywhere else in this class.

    /**
     * @var \App\Parser\DocxParser\ParseDocx
     */
    private $parserDocx;

    /**
     * @var \App\Parser\ParseXml
     */
    private $parserXml;

    /**
     * @var \App\Parser\HtmlParser\ParseHtml
     */
    private $parserHtml;

    /**
     * @var \App\Parser\ParseHtmlArray
     */
    private $parseHtmlArray;

    /**
     * Parser used by getContent() for every non-PDF document.
     *
     * @var \App\Parser\ParseTextArray
     */
    private $parserText;

    /**
     * Create a new job instance.
     *
     * @param string $path Path of the source file on the "local" disk.
     * @param string $type Document type; passed to Convertor and compared against 'pdf'.
     */
    public function __construct(string $path, $type)
    {
        $this->path = $path;
        $this->type = $type;

        $this->storage = Storage::disk('local');
        $this->parserDocx = new ParseDocx();
        $this->parserText = new ParseTextArray();
        $this->parserXml = new ParseXml();
        $this->parserHtml = new ParseHtml();
        $this->parseHtmlArray = new ParseHtmlArray();
    }

    /**
     * Execute the job.
     *
     * Returns without dispatching anything when the parser yields no content.
     * Exceptions raised while storing the Markdown file or dispatching
     * SendToCore are logged and swallowed; exceptions raised by the convertor
     * or by getContent() are not caught here.
     *
     * @return void
     */
    public function handle()
    {
        $convertor = new Convertor($this->path, $this->type);

        // From here on $this->path is whatever the convertor returned.
        $this->path = $convertor->execute();

        $content = $this->getContent();

        if (! $content) {
            return;
        }

        // UTF-8 normalisation through convertToUTF8() is intentionally not applied here.

        try {
            $filePath = $this->storeContent($content);

            SendToCore::dispatch($filePath);
        } catch (\Exception $e) {
            // Keep the original message; attach the exception so the stack trace is not lost.
            Log::error('Error writing in to the file: ' . $e->getMessage(), ['exception' => $e]);
        }
    }

    /**
     * Clean up after the job has failed.
     *
     * Logs the failure, removes the file at the current path if it exists and
     * dispatches SendToCore with null.
     *
     * @param \Throwable|null $exception Exception that caused the failure, when the
     *                                   caller provides one.
     *
     * @return void
     */
    public function failed($exception = null)
    {
        if (! $this->storage) {
            $this->storage = Storage::disk('local');
        }

        $context = ['path' => $this->path];

        if ($exception !== null) {
            $context['exception'] = $exception;
        }

        Log::error('Ingest documents failed.', $context);

        // @TODO Delete docx, txt and md files.
        if ($this->storage->exists($this->path)) {
            $this->storage->delete($this->path);
        }

        SendToCore::dispatch(null);
    }

    /**
     * Parse the file at the current path.
     *
     * For the 'pdf' type the method first waits until the file exists on the
     * disk and then parses it with a ParseTextArray constructed with `true`
     * (NOTE(review): meaning of that flag is defined in ParseTextArray — confirm).
     * Every other type is parsed with the shared ParseTextArray instance.
     *
     * @return mixed Whatever ParseTextArray::fromFile() returns.
     *
     * @throws \RuntimeException When a PDF's converted file does not appear
     *                           within $pdfWaitTimeout seconds.
     */
    protected function getContent()
    {
        if ($this->type === 'pdf') {
            $this->waitForFile();

            $textParser = new ParseTextArray(true);

            return $textParser->fromFile($this->storage->path($this->path));
        }

        return $this->parserText->fromFile($this->storage->path($this->path));
    }

    /**
     * Convert every string leaf of a (possibly nested) array to UTF-8.
     *
     * No source encoding is passed to mb_convert_encoding(), so the mbstring
     * default source encoding applies. Non-string leaves are left untouched so
     * that integers, booleans and nulls keep their type.
     *
     * @param array $content
     *
     * @return array The converted copy of $content.
     */
    protected function convertToUTF8($content)
    {
        array_walk_recursive(
            $content,
            function (&$entry) {
                if (is_string($entry)) {
                    $entry = mb_convert_encoding($entry, 'UTF-8');
                }
            }
        );

        return $content;
    }

    /**
     * Write the parsed content as Markdown next to the converted file and
     * remove the converted file.
     *
     * @param mixed $content Parsed content, handed to MDConvertor.
     *
     * @return string Path of the stored ".md" file on the "local" disk.
     */
    protected function storeContent($content)
    {
        // Only the final extension is removed, so dots in directory names or
        // earlier in the file name no longer truncate the target path.
        $name = $this->pathWithoutExtension($this->path);

        // Or json?
        $filePath = $this->storeMD($name, $content);

        // Delete converted file. We now have the .md file.
        // Skip the delete when both paths are identical, otherwise the
        // Markdown file that was just written would be removed.
        if ($filePath !== $this->path) {
            $this->storage->delete($this->path);
        }

        return $filePath;
    }

    /**
     * Render the content with MDConvertor and store it as "<name>.md".
     *
     * @param string $name    Target path without extension.
     * @param mixed  $content Parsed content, handed to MDConvertor.
     *
     * @return string The stored file name.
     */
    protected function storeMD($name, $content)
    {
        $fileName = "$name.md";

        $convertor = new MDConvertor($content);

        $this->storage->put($fileName, $convertor->execute());

        return $fileName;
    }

    /**
     * Store the content unchanged as "<name>.json".
     *
     * Not called from within this class.
     *
     * @param string $name    Target path without extension.
     * @param mixed  $content Contents written to the file as given.
     *
     * @return string The stored file name.
     */
    protected function storeJson($name, $content)
    {
        $fileName = "$name.json";

        $this->storage->put($fileName, $content);

        return $fileName;
    }

    /**
     * Block until the file at the current path exists, polling once a second.
     *
     * @return void
     *
     * @throws \RuntimeException When the file is still missing after
     *                           $pdfWaitTimeout seconds.
     */
    private function waitForFile()
    {
        $deadline = time() + (int) $this->pdfWaitTimeout;

        while (! $this->storage->exists($this->path)) {
            if (time() >= $deadline) {
                throw new \RuntimeException(sprintf(
                    'Timed out after %d seconds waiting for converted file "%s".',
                    (int) $this->pdfWaitTimeout,
                    $this->path
                ));
            }

            sleep(1);
        }
    }

    /**
     * Strip the trailing ".ext" from the last segment of a path.
     *
     * Paths without an extension, and dot-files such as ".env" whose whole
     * name is the "extension", are returned unchanged.
     *
     * @param string $path
     *
     * @return string
     */
    private function pathWithoutExtension($path)
    {
        $extension = pathinfo($path, PATHINFO_EXTENSION);
        $fileName = pathinfo($path, PATHINFO_FILENAME);

        if ($extension === '' || $fileName === '') {
            return $path;
        }

        // Drop the extension plus its leading dot from the end of the path.
        return substr($path, 0, -(strlen($extension) + 1));
    }
}
|