From 4b135c4fc493c398de3d9dc888c789a997a2eb0c Mon Sep 17 00:00:00 2001 From: Orzu Ionut Date: Wed, 12 May 2021 17:27:50 +0300 Subject: [PATCH] Text conversion improvement --- README.md | 1 + app/Http/Controllers/IngestController.php | 4 +- app/Ingest/MDConvertor.php | 12 +++- app/Jobs/IngestDocuments.php | 27 ++++--- app/Jobs/SendToCore.php | 7 ++ app/Parser/ParseTextArray.php | 87 +++++++++++++++-------- 6 files changed, 94 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index be75031..42badbd 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ apt-get update apt-get install software-properies-common add-apt-repository ppa:deadsnakes/ppa apt-get install supervisor python3.8 python3.8-dev +apt-get install redis-server supervisorctl restart all curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py diff --git a/app/Http/Controllers/IngestController.php b/app/Http/Controllers/IngestController.php index 32f7217..979a206 100644 --- a/app/Http/Controllers/IngestController.php +++ b/app/Http/Controllers/IngestController.php @@ -18,9 +18,9 @@ class IngestController extends Controller $handler->handle(); - return response()->json([ + return response()->json(array( 'status' => 'processing', - ]); + )); } catch (\Exception $exception) { return response()->json([ 'status' => 'error', diff --git a/app/Ingest/MDConvertor.php b/app/Ingest/MDConvertor.php index 9bfd465..2b8505f 100644 --- a/app/Ingest/MDConvertor.php +++ b/app/Ingest/MDConvertor.php @@ -8,7 +8,7 @@ class MDConvertor public function __construct($content) { - $this->content = json_decode($content, true); + $this->content = $content; } public function execute() @@ -26,7 +26,7 @@ class MDConvertor ' ' . (isset($paragraph['numbering']) ? $paragraph['numbering'] : '') . ' ' . - $paragraph['content'] . + $this->parseContent($paragraph['content']) . "\n"; if ( @@ -42,4 +42,12 @@ class MDConvertor return $content; } + + protected function parseContent($content) + { + $content = preg_replace("/\xE2\x80\x8B/", "", $content); + $content = preg_replace("/[\x{200B}-\x{200D}\x{FEFF}]/u", "", $content); + + return utf8_decode($content); + } } diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 49636a5..845d89c 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -62,6 +62,13 @@ class IngestDocuments implements ShouldQueue { $this->path = $path; $this->type = $type; + + $this->storage = Storage::disk('local'); + $this->parserDocx = new ParseDocx(); + $this->parserText = new ParseTextArray(); + $this->parserXml = new ParseXml(); + $this->parserHtml = new ParseHtml(); + $this->parseHtmlArray = new ParseHtmlArray(); } /** @@ -71,19 +78,16 @@ class IngestDocuments implements ShouldQueue */ public function handle() { - $this->storage = Storage::disk('local'); - $this->parserDocx = new ParseDocx(); - $this->parserText = new ParseTextArray(); - $this->parserXml = new ParseXml(); - $this->parserHtml = new ParseHtml(); - $this->parseHtmlArray = new ParseHtmlArray(); - $convertor = new Convertor($this->path, $this->type); $this->path = $convertor->execute(); $content = $this->getContent(); - $content = $this->convertToUTF8($content); + if ( ! $content) { + return; + } + +// $content = $this->convertToUTF8($content); try { $filePath = $this->storeContent($content); @@ -96,8 +100,11 @@ class IngestDocuments implements ShouldQueue } } - protected function failed() + public function failed() { + Log::error('Ingest documents failed.'); + + // @TODO Delete docx, txt and md files. if ($this->storage->exists($this->path)) { $this->storage->delete($this->path); } @@ -133,7 +140,7 @@ class IngestDocuments implements ShouldQueue } ); - return utf8_encode(json_encode($content)); + return $content; } protected function storeContent($content) diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php index 6d8f8fe..abc3470 100644 --- a/app/Jobs/SendToCore.php +++ b/app/Jobs/SendToCore.php @@ -72,6 +72,13 @@ class SendToCore implements ShouldQueue } } + public function failed() + { + if ($this->filePath) { + $this->storage->delete($this->filePath); + } + } + /** * Send the data to the core trough webhooks * diff --git a/app/Parser/ParseTextArray.php b/app/Parser/ParseTextArray.php index 8d79429..b16276b 100644 --- a/app/Parser/ParseTextArray.php +++ b/app/Parser/ParseTextArray.php @@ -125,7 +125,6 @@ class ParseTextArray } else { Log::error('The given file dose not exists!'); } - } @@ -144,6 +143,7 @@ class ParseTextArray $alreadyHandled = []; $countData = count($textAsArray); + for ($i = 0; $i < $countData; $i++) { if (array_key_exists($i, $alreadyHandled)) { continue; @@ -154,16 +154,19 @@ class ParseTextArray 'content' => trim($textAsArray[ $i ]), 'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ])) ]; + //Remove numbering from the paragraph content if ($numbering = $this->getNumbering($textAsArray[ $i ])) { $data[ $i ][ 'numbering' ] = $numbering; $data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.')); } + if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ], 'of') !== false) { $alreadyHandled[] = $i; break; } + $j = $i + 1; if (isset($textAsArray[ $j ])) { @@ -171,67 +174,91 @@ class ParseTextArray if (array_key_exists($j, $alreadyHandled)) { continue; } - if ($this->pdf && isset($textAsArray[ $j ]) && strpos($textAsArray[ $j ], - 'Page') !== false && strpos($textAsArray[ $j ], 'of') !== false) { + + if ( + $this->pdf && + isset($textAsArray[ $j ]) && + strpos($textAsArray[ $j ], 'Page') !== false && + strpos($textAsArray[ $j ], 'of') !== false + ) { $alreadyHandled[] = $j; - continue; + continue; } - //extract the content and count the number of the empty spaces from the begining + // Extract the content and count the number of the empty spaces from the beginning $data[ $j ] = [ 'content' => trim($textAsArray[ $j ]), 'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ])) ]; - //Remove numbering from the paragraph content + + // Remove numbering from the paragraph content if ($numbering = $this->getNumbering($textAsArray[ $j ])) { $data[ $j ][ 'numbering' ] = $numbering; - $data[ $j ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), - '.')); + $data[ $j ][ 'content' ] = trim( + ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), '.') + ); } - //break if outh have numbering and the space is equal - if ($data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && $this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && substr_count($data[ $i ][ 'numbering' ], - '.') == substr_count($data[ $j ][ 'numbering' ], - '.') && count(array_filter(str_split($data[ $i ][ 'numbering' ]), - 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), - 'is_numeric'))) { + // Break if both have numbering and the space is equal + if ( + $data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && + $this->hasNumbering($data[ $j ]) && + $this->hasNumbering($data[ $i ]) && + substr_count($data[ $i ][ 'numbering' ], '.') == substr_count($data[ $j ][ 'numbering' ], '.') && + count(array_filter(str_split($data[ $i ][ 'numbering' ]), 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), 'is_numeric'))) { break; } - if ($this->hasNumbering($data[ $j ]) && ! $this->hasNumbering($data[ $i ]) && ! $data[ $i ][ 'spaces' ] && $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && ! in_array(substr($data[ $i ][ 'content' ], - -1), [':'])) { - + if ( + $this->hasNumbering($data[ $j ]) && + ! $this->hasNumbering($data[ $i ]) && + ! $data[ $i ][ 'spaces' ] && + $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && + ! in_array(substr($data[ $i ][ 'content' ], -1), [':']) + ) { break; } - if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) { - + if ( + $this->hasNumbering($data[ $j ]) && + $this->hasNumbering($data[ $i ]) && + ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1 + ) { break; } - if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) { + if ( + $this->hasNumbering($data[ $j ]) && + $this->hasNumbering($data[ $i ]) && + ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1 + ) { break; } - //Hardcoded breakpoints - if ($this->hasNumbering($data[ $j ]) && in_array(strtoupper(str_replace(['.', "\t", "​ "], '', - $data[ $j ][ 'content' ])), $this->breakPoints)) { + // Hardcoded breakpoints + if ( + $this->hasNumbering($data[ $j ]) && + in_array(strtoupper(str_replace(['.', "\t", "​ "], '', $data[ $j ][ 'content' ])), $this->breakPoints) + ) { break; } - //Hardcoded "Schedule break" - if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), - 0, 10), 'schedule') !== false) { + // Hardcoded "Schedule break" + if ( + ! $this->hasNumbering($data[ $j ]) && + strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), 0, 10), 'schedule') !== false + ) { break; } - if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), - 'Exhibit') !== false && ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), [ - '.' - ])) { + if ( + ! $this->hasNumbering($data[ $j ]) && + strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), 'Exhibit') !== false && + ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), ['.']) + ) { break; }