Browse Source
Performance analyzer. Apply OCR on images in digital documents
hidden_tags_with_bookmarks
Performance analyzer. Apply OCR on images in digital documents
hidden_tags_with_bookmarks
Orzu Ionut
3 years ago
13 changed files with 332 additions and 122 deletions
-
85app/Console/Commands/AnalyzePerformance.php
-
16app/Ingest/Convertor.php
-
11app/Ingest/DocumentHandler.php
-
45app/Ingest/DocxConvertor.php
-
9app/Ingest/OCR.php
-
62app/Ingest/Office.php
-
18app/Ingest/OtherConvertor.php
-
88app/Ingest/PDFConvertor.php
-
66app/Jobs/IngestDocuments.php
-
7resources/python/dewarp/page_dewarp.py
-
29resources/python/ocr/localize_text_tesseract.py
-
BINresources/python/ocr/logo.jpg
-
2resources/python/ocr/results/.gitignore
@ -0,0 +1,85 @@ |
|||
<?php |
|||
|
|||
namespace App\Console\Commands; |
|||
|
|||
use App\Ingest\DocumentHandler; |
|||
use Illuminate\Console\Command; |
|||
use Illuminate\Http\UploadedFile; |
|||
use Illuminate\Support\Carbon; |
|||
use Illuminate\Support\Facades\Redis; |
|||
|
|||
/**
 * Artisan command that runs the ingest pipeline over every file found
 * (recursively) under a directory, recording bookkeeping values in Redis.
 *
 * Redis keys written by this command:
 *  - analyze_performance_time:            Unix timestamp taken when the run starts
 *  - analyze_performance_path:            the directory passed on the command line
 *  - analyze_performance_remaining_files: number of files discovered
 *
 * NOTE(review): the final message says results end up in
 * 'ingest_analyze_performance.txt'; that file is not written here, presumably
 * by the code DocumentHandler dispatches to — confirm.
 */
class AnalyzePerformance extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'analyze:run {path : The directory path}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Run analyzer on multiple files in a directory.';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int Process exit code: 0 on success, 1 when the path is not a directory.
     */
    public function handle()
    {
        $directoryPath = $this->argument('path');

        if (! is_dir($directoryPath)) {
            $this->error('The path is invalid: not a directory.');

            // Non-zero so shells and schedulers can detect the failure.
            return 1;
        }

        $redis = Redis::connection();

        $redis->set('analyze_performance_time', Carbon::now()->format('U'));
        $redis->set('analyze_performance_path', $directoryPath);

        $allFiles = $this->getDirContents($directoryPath);

        $redis->set('analyze_performance_remaining_files', count($allFiles));

        foreach ($allFiles as $index => $file) {
            // The list index doubles as the handler id and as part of the
            // display name given to the UploadedFile wrapper.
            $handler = new DocumentHandler($index, new UploadedFile($file, "File {$index}"), false);

            $handler->handle();
        }

        $this->info('Processing... When it\'s done the results will be added to the \'ingest_analyze_performance.txt\' file in the directory you have provided.');

        return 0;
    }

    /**
     * Recursively collect the absolute paths of all non-directory entries
     * below $dir.
     *
     * Entries whose path cannot be resolved (e.g. dangling symlinks) and
     * directories that cannot be listed are skipped instead of polluting the
     * result with `false` values.
     *
     * @param string $dir     Directory to scan.
     * @param array  $results Accumulator shared across recursive calls.
     *
     * @return array List of absolute file paths.
     */
    protected function getDirContents($dir, &$results = [])
    {
        $entries = scandir($dir);

        if ($entries === false) {
            // Unreadable directory: nothing to add from this branch.
            return $results;
        }

        foreach ($entries as $entry) {
            if ($entry === '.' || $entry === '..') {
                continue;
            }

            $path = realpath($dir . DIRECTORY_SEPARATOR . $entry);

            if ($path === false) {
                // realpath() fails for broken symlinks or entries removed mid-scan.
                continue;
            }

            if (is_dir($path)) {
                $this->getDirContents($path, $results);
            } else {
                $results[] = $path;
            }
        }

        return $results;
    }
}
@ -0,0 +1,62 @@ |
|||
<?php |
|||
|
|||
namespace App\Ingest; |
|||
|
|||
use Symfony\Component\Process\Process; |
|||
|
|||
/**
 * Thin wrapper around a headless `soffice` invocation that converts a single
 * document to another format.
 *
 * Each instance gets its own id, used to build a private LibreOffice
 * user-installation directory under /tmp and a pipe name for the process.
 */
class Office
{
    /** @var string Unique id of this instance (from uniqid()). */
    protected $id;

    /** @var string Name of the per-instance profile directory under /tmp. */
    protected $directory;

    public function __construct()
    {
        $this->id = uniqid();
        $this->directory = 'soffice-dir-' . $this->id;

        // HOME is supplied to soffice through the Process environment in
        // runConversion(). Spawning a separate `export HOME=...` process could
        // never influence later processes, and in array form it was not even
        // executed (the whole string was taken as the program name).
    }

    /**
     * Convert $filePath to $convertTo, writing the output into $directoryPath.
     *
     * @param string $convertTo     Target format passed to `--convert-to`.
     * @param string $filePath      Path of the document to convert.
     * @param string $directoryPath Directory passed to `--outdir`.
     *
     * @return bool True when soffice exited successfully.
     *
     * Exceptions thrown by the underlying Process (for example when the
     * 10 second timeout elapses) propagate to the caller; the temporary
     * directory is removed in either case.
     */
    public function run($convertTo, $filePath, $directoryPath)
    {
        $this->makeTemporaryDirectory();

        try {
            return $this->runConversion($convertTo, $filePath, $directoryPath);
        } finally {
            $this->removeTemporaryDirectory();
        }
    }

    /**
     * Build and run the soffice conversion process.
     *
     * @param string $convertTo
     * @param string $filePath
     * @param string $directoryPath
     *
     * @return bool
     */
    protected function runConversion($convertTo, $filePath, $directoryPath)
    {
        $home = env('USER_HOME_PATH');

        $process = new Process(
            [
                'soffice',
                // NOTE(review): in array form this argument reaches soffice with the
                // literal double quotes, and the service name is spelled
                // "ServiceMananger". Left untouched because changing it may alter
                // how soffice behaves; confirm whether --accept is needed at all.
                '--accept="pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceMananger"',
                '-env:UserInstallation=file:///tmp/' . $this->directory,
                '--headless',
                '--convert-to',
                $convertTo,
                $filePath,
                '--outdir',
                $directoryPath,
            ],
            null,
            // Only override HOME when it is actually configured.
            $home === null ? null : ['HOME' => $home]
        );

        $process->setTimeout(10);

        $process->run();

        return $process->isSuccessful();
    }

    /**
     * Create the per-instance profile directory under /tmp.
     *
     * Each argument is its own array element; a single "mkdir /tmp/..." string
     * would be treated as the executable name and never run.
     */
    protected function makeTemporaryDirectory()
    {
        (new Process(['mkdir', '-p', '/tmp/' . $this->directory]))->run();
    }

    /**
     * Remove the per-instance profile directory and everything inside it.
     */
    protected function removeTemporaryDirectory()
    {
        (new Process(['rm', '-rf', '/tmp/' . $this->directory]))->run();
    }
}
@ -0,0 +1,29 @@ |
|||
"""Locate text-like regions in logo.jpg and save each one as a separate image.

Pipeline: grayscale -> Otsu threshold -> invert -> morphological close with a
wide rectangular kernel (merges neighbouring glyphs into blobs) -> external
contours -> bounding boxes. Boxes taller than MIN_HEIGHT are written to
results/ROI_<n>.png and outlined in red on the preview image.
"""
import os

import cv2

INPUT_PATH = "logo.jpg"
OUTPUT_DIR = "results"
# Wide, short kernel so the close operation joins characters on the same line.
KERNEL_SIZE = (30, 5)
# Depends on text size, so the greater the value the less objects we get.
MIN_HEIGHT = 50

image = cv2.imread(INPUT_PATH, 1)
if image is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise SystemExit("Could not read input image: {}".format(INPUT_PATH))

img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU, img)
cv2.bitwise_not(img, img)

rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, KERNEL_SIZE)

img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, rect_kernel)
# findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
# (image, contours, hierarchy) in 3.x; [-2] selects the contours in all of them.
contours = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

os.makedirs(OUTPUT_DIR, exist_ok=True)

ROI_number = 0
for c in contours:
    x, y, w, h = cv2.boundingRect(c)

    if h <= MIN_HEIGHT:
        continue

    # Save the crop before drawing, otherwise the red outline ends up in the ROI.
    roi_path = os.path.join(OUTPUT_DIR, 'ROI_{}.png'.format(ROI_number))
    if not cv2.imwrite(roi_path, image[y:y + h, x:x + w]):
        raise SystemExit("Could not write {}".format(roi_path))
    ROI_number += 1

    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 1)

cv2.imshow("Result", image)
cv2.waitKey(0)
After Width: 638 | Height: 359 | Size: 19 KiB |
@ -0,0 +1,2 @@ |
|||
* |
|||
!.gitignore |
Write
Preview
Loading…
Cancel
Save
Reference in new issue