Browse Source
Performance analyzer. Apply OCR on images in digital documents
hidden_tags_with_bookmarks
Performance analyzer. Apply OCR on images in digital documents
hidden_tags_with_bookmarks
Orzu Ionut
3 years ago
13 changed files with 332 additions and 122 deletions
-
85app/Console/Commands/AnalyzePerformance.php
-
16app/Ingest/Convertor.php
-
11app/Ingest/DocumentHandler.php
-
45app/Ingest/DocxConvertor.php
-
9app/Ingest/OCR.php
-
62app/Ingest/Office.php
-
18app/Ingest/OtherConvertor.php
-
104app/Ingest/PDFConvertor.php
-
66app/Jobs/IngestDocuments.php
-
7resources/python/dewarp/page_dewarp.py
-
29resources/python/ocr/localize_text_tesseract.py
-
BINresources/python/ocr/logo.jpg
-
2resources/python/ocr/results/.gitignore
@ -0,0 +1,85 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Console\Commands; |
||||
|
|
||||
|
use App\Ingest\DocumentHandler; |
||||
|
use Illuminate\Console\Command; |
||||
|
use Illuminate\Http\UploadedFile; |
||||
|
use Illuminate\Support\Carbon; |
||||
|
use Illuminate\Support\Facades\Redis; |
||||
|
|
||||
|
/**
 * Console command that runs the ingest pipeline over every file found
 * (recursively) in a directory.
 *
 * Before handing the files over, it stores the start timestamp, the directory
 * path and the number of files in Redis under the `analyze_performance_*` keys.
 */
class AnalyzePerformance extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'analyze:run {path : The directory path}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Run analyzer on multiple files in a directory.';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int 0 when every file was handed to the document handler,
     *             1 when the given path is not a directory.
     */
    public function handle()
    {
        $directoryPath = $this->argument('path');

        if (! is_dir($directoryPath)) {
            $this->error('The path is invalid: not a directory.');

            // Non-zero exit code so shell scripts can detect the failure.
            return 1;
        }

        $redis = Redis::connection();

        // Bookkeeping for the run: start time (Unix seconds) and the analyzed directory.
        $redis->set('analyze_performance_time', Carbon::now()->format('U'));
        $redis->set('analyze_performance_path', $directoryPath);

        $allFiles = $this->getDirContents($directoryPath);

        // NOTE(review): presumably decremented elsewhere as files finish — confirm against the ingest job.
        $redis->set('analyze_performance_remaining_files', count($allFiles));

        foreach ($allFiles as $index => $file) {
            $handler = new DocumentHandler($index, new UploadedFile($file, "File {$index}"), false);

            $handler->handle();
        }

        $this->info('Processing... When it\'s done the results will be added to the \'ingest_analyze_performance.txt\' file in the directory you have provided.');

        return 0;
    }

    /**
     * Recursively collect the resolved paths of all non-directory entries below $dir.
     *
     * Directories that cannot be read and entries whose path cannot be resolved
     * (for example dangling symlinks) are skipped.
     *
     * @param string $dir     Directory to scan.
     * @param array  $results Accumulator shared across the recursive calls.
     *
     * @return array List of absolute file paths.
     */
    protected function getDirContents($dir, &$results = [])
    {
        $entries = scandir($dir);

        // scandir() returns false when the directory cannot be opened.
        if ($entries === false) {
            return $results;
        }

        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            // realpath() returns false for unresolvable entries; never collect that as a "file".
            if ($path === false) {
                continue;
            }

            if (is_dir($path)) {
                $this->getDirContents($path, $results);
            } else {
                $results[] = $path;
            }
        }

        return $results;
    }
}
@ -0,0 +1,62 @@ |
|||||
|
<?php |
||||
|
|
||||
|
namespace App\Ingest; |
||||
|
|
||||
|
use Symfony\Component\Process\Process; |
||||
|
|
||||
|
/**
 * Thin wrapper around the `soffice` command line for headless document conversion.
 *
 * Every instance uses its own LibreOffice user-installation directory under /tmp
 * (derived from a uniqid()), which is created before and removed after a conversion.
 */
class Office
{
    /** @var string Unique id used in the pipe name and the temporary directory name. */
    protected $id;

    /** @var string Name (not full path) of the temporary directory under /tmp. */
    protected $directory;

    public function __construct()
    {
        $this->id = uniqid();
        $this->directory = 'soffice-dir-' . $this->id;

        // HOME is no longer "exported" through a separate process here: a child
        // process cannot change the environment of later processes. It is passed
        // directly to each spawned process instead (see processEnvironment()).
    }

    /**
     * Convert a file with soffice.
     *
     * @param string $convertTo     Target format, passed to `--convert-to`.
     * @param string $filePath      Path of the file to convert.
     * @param string $directoryPath Output directory, passed to `--outdir`.
     *
     * @return bool Whether the soffice process exited successfully.
     */
    public function run($convertTo, $filePath, $directoryPath)
    {
        $this->makeTemporaryDirectory();

        try {
            return $this->runConversion($convertTo, $filePath, $directoryPath);
        } finally {
            // Clean up even when the process times out and run() throws.
            $this->removeTemporaryDirectory();
        }
    }

    /**
     * Spawn soffice in headless mode and wait (at most 10 seconds) for it to finish.
     *
     * @return bool Whether the soffice process exited successfully.
     */
    protected function runConversion($convertTo, $filePath, $directoryPath)
    {
        $process = new Process([
            'soffice',
            // No surrounding quotes: array-form commands are not parsed by a shell,
            // so quote characters would be handed to soffice literally.
            '--accept=pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceManager',
            '-env:UserInstallation=file://' . $this->temporaryDirectoryPath(),
            '--headless',
            '--convert-to',
            $convertTo,
            $filePath,
            '--outdir',
            $directoryPath,
        ], null, $this->processEnvironment());

        $process->setTimeout(10);

        $process->run();

        return $process->isSuccessful();
    }

    /**
     * Create the temporary soffice user-installation directory.
     */
    protected function makeTemporaryDirectory()
    {
        // One array element per argument; a single 'mkdir /tmp/…' element would be
        // treated as the name of the executable.
        (new Process(['mkdir', '-p', $this->temporaryDirectoryPath()]))->run();
    }

    /**
     * Remove the temporary soffice user-installation directory and its contents.
     */
    protected function removeTemporaryDirectory()
    {
        (new Process(['rm', '-rf', $this->temporaryDirectoryPath()]))->run();
    }

    /**
     * Absolute path of the temporary directory used by this instance.
     *
     * @return string
     */
    protected function temporaryDirectoryPath()
    {
        return '/tmp/' . $this->directory;
    }

    /**
     * Extra environment variables for spawned processes.
     *
     * @return array|null ['HOME' => …] when USER_HOME_PATH is configured, otherwise
     *                    null so the current environment is inherited unchanged.
     */
    protected function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        return $home ? ['HOME' => $home] : null;
    }
}
@ -0,0 +1,29 @@ |
|||||
|
import cv2 |
||||
|
|
||||
|
# Find large bright-on-dark / dark-on-bright regions in logo.jpg, save each one
# as results/ROI_<n>.png and show the annotated image.
image = cv2.imread("logo.jpg", 1)

# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise SystemExit('Could not read "logo.jpg".')

# Pristine copy to crop from, so saved regions never contain the red boxes drawn below.
original = image.copy()

img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Otsu binarization followed by inversion, both done in place.
cv2.threshold(img, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU, img)
cv2.bitwise_not(img, img)

# Wide, flat kernel: presumably meant to merge neighbouring characters into one blob.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))

img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, rect_kernel)

# findContours returns (contours, hierarchy) on OpenCV 4.x and
# (image, contours, hierarchy) on 3.x; the contours are second-to-last in both.
contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

ROI_number = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)

    # Depends on text size, so the greater the value the less objects we get.
    if h > 50:
        cv2.rectangle(image, (x, y), (x+w, y+h), (0, 0, 255), 1)

        ROI = original[y:y+h, x:x+w]
        cv2.imwrite('results/ROI_{}.png'.format(ROI_number), ROI)
        ROI_number += 1

cv2.imshow("Result", image)
cv2.waitKey(0)
After Width: 638 | Height: 359 | Size: 19 KiB |
@ -0,0 +1,2 @@ |
|||||
|
* |
||||
|
!.gitignore |
Write
Preview
Loading…
Cancel
Save
Reference in new issue