Browse Source

Fix issue with handling PDF files. Add support for plain text files.

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
17aa1ab326
  1. 13
      app/Ingest/Convertor.php
  2. 2
      app/Ingest/DocumentHandler.php
  3. 12
      app/Jobs/IngestDocuments.php
  4. 18
      app/Jobs/SendToCore.php

13
app/Ingest/Convertor.php

@ -24,6 +24,10 @@ class Convertor
public function execute()
{
if ($this->type === 'txt') {
return $this->path;
}
if ($this->type === 'pdf') {
$this->convertPdfToText();
@ -107,10 +111,12 @@ class Convertor
private function convertPdfToText()
{
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$process = new Process([
'pip',
'pip3',
'install',
"pdftotext"
'pdftotext',
]);
$process->run();
@ -119,9 +125,6 @@ class Convertor
throw new ProcessFailedException($process);
}
/**
* Convert pdf to text
*/
$process = new Process([
'python3',
storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),

2
app/Ingest/DocumentHandler.php

@ -18,6 +18,7 @@ class DocumentHandler
const PDF_WPS_MIME_TYPE = 'application/wps-office.pdf';
const DOCXOLD_MIME_TYPE = 'application/octet-stream';
const DOCX_WPS_TYPE = 'application/wps-office.docx';
const PLAIN_TEXT_TYPE = 'text/plain';
protected $supportedFiles = [
self::DOCX_MIME_TYPE => 'docx',
@ -28,6 +29,7 @@ class DocumentHandler
self::ODT_MIME_TYPE => 'odt',
self::PDF_MIME_TYPE => 'pdf',
self::PDF_WPS_MIME_TYPE => 'pdf',
self::PLAIN_TEXT_TYPE => 'txt',
];
public function __construct($id, $document)

12
app/Jobs/IngestDocuments.php

@ -48,11 +48,6 @@ class IngestDocuments implements ShouldQueue
*/
private $parseHtmlArray;
/**
* @var \App\Parser\ParseTextArray
*/
private $parserText;
/**
* Create a new job instance.
*
@ -65,7 +60,6 @@ class IngestDocuments implements ShouldQueue
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
@ -87,7 +81,7 @@ class IngestDocuments implements ShouldQueue
return;
}
// $content = $this->convertToUTF8($content);
$content = $this->convertToUTF8($content);
try {
$filePath = $this->storeContent($content);
@ -129,7 +123,9 @@ class IngestDocuments implements ShouldQueue
return $textParser->fromFile($this->storage->path($this->path));
}
return $this->parserText->fromFile($this->storage->path($this->path));
$textParser = new ParseTextArray();
return $textParser->fromFile($this->storage->path($this->path));
}
protected function convertToUTF8($content)

18
app/Jobs/SendToCore.php

@ -47,6 +47,7 @@ class SendToCore implements ShouldQueue
* Execute the job.
*
* @return void
* @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
*/
public function handle()
{
@ -83,16 +84,16 @@ class SendToCore implements ShouldQueue
* Send the data to the core through webhooks
*
* @param $content
* @param string $status
* @return bool
*/
private function sendTheData($content)
protected function sendTheData($content)
{
try {
WebhookCall::create()
->url($this->url)
->payload(['data' => [
'id' => $this->id,
'content' => $content,
'content' => $this->encodeContent($content),
'status' => $content ? 'success' : 'fail',
]])
->useSecret($this->secret)
@ -105,4 +106,15 @@ class SendToCore implements ShouldQueue
return false;
}
}
/**
 * Normalise the payload content to UTF-8.
 *
 * Non-string and empty values (null, false, '') are returned untouched, so a
 * caller that tests the content for truthiness sees the same value it passed in.
 *
 * @param mixed $content Raw content; only non-empty strings are inspected.
 * @return mixed The content as UTF-8 when it was a non-empty string, otherwise the input as-is.
 */
protected function encodeContent($content)
{
    // Nothing to detect or convert for null/false/empty input.
    if (!is_string($content) || $content === '') {
        return $content;
    }

    $encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1, WINDOWS-1252, WINDOWS-1251', true);

    if ($encoding === 'UTF-8') {
        return $content;
    }

    if ($encoding === false) {
        // Detection failed: never hand `false` to iconv as the source charset.
        // Re-encoding UTF-8 -> UTF-8 substitutes any invalid byte sequences instead.
        return mb_convert_encoding($content, 'UTF-8', 'UTF-8');
    }

    $converted = iconv($encoding, 'UTF-8//IGNORE', $content);

    // iconv() returns false on failure; fall back to mbstring rather than
    // passing `false` on as the content.
    if ($converted === false) {
        return mb_convert_encoding($content, 'UTF-8', $encoding);
    }

    return $converted;
}
}
Loading…
Cancel
Save