From b48a9c439e2972b98789433319d6ccaa512db24f Mon Sep 17 00:00:00 2001
From: Radu Liviu Carjan <radu.carjan@gmail.com>
Date: Thu, 20 Oct 2022 14:31:43 +0300
Subject: [PATCH] Added dewarp to the ingest image

---
 docker/ingest/Dockerfile              |  11 +-
 docker/ingest/dewarp/LICENSE.txt      |  21 +
 docker/ingest/dewarp/README.md        |  14 +
 docker/ingest/dewarp/derive_cubic.py  |  46 ++
 docker/ingest/dewarp/page_dewarp.py   | 923 ++++++++++++++++++++++++++
 docker/ingest/dewarp/requirements.txt |   5 +
 6 files changed, 1019 insertions(+), 1 deletion(-)
 create mode 100644 docker/ingest/dewarp/LICENSE.txt
 create mode 100644 docker/ingest/dewarp/README.md
 create mode 100644 docker/ingest/dewarp/derive_cubic.py
 create mode 100755 docker/ingest/dewarp/page_dewarp.py
 create mode 100644 docker/ingest/dewarp/requirements.txt

diff --git a/docker/ingest/Dockerfile b/docker/ingest/Dockerfile
index e287f35..51962a8 100644
--- a/docker/ingest/Dockerfile
+++ b/docker/ingest/Dockerfile
@@ -2,6 +2,10 @@ FROM rcarjan/nginx-php:7.4
 
 LABEL maintainer="Radu Liviu Carjan"
 
+## Add required files (COPY: plain local copy, no archive/URL handling)
+RUN mkdir /var/www/dewarp
+COPY dewarp /var/www/dewarp
+
 ## Install libreoffice
 RUN apt-add-repository -y ppa:libreoffice/ppa && \
     apt-get install -y \
@@ -40,5 +44,10 @@ RUN pip install \
     pdftotext \
     supervisor \
     opencv-python
+    
+WORKDIR /var/www/dewarp
+RUN pip install -r requirements.txt
+
+RUN mkdir /var/log/queue
 
-RUN mkdir /var/log/queue
\ No newline at end of file
+WORKDIR /var/www/ingest
diff --git a/docker/ingest/dewarp/LICENSE.txt b/docker/ingest/dewarp/LICENSE.txt
new file mode 100644
index 0000000..e35410a
--- /dev/null
+++ b/docker/ingest/dewarp/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016, Matt Zucker
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docker/ingest/dewarp/README.md b/docker/ingest/dewarp/README.md
new file mode 100644
index 0000000..7ea5f8c
--- /dev/null
+++ b/docker/ingest/dewarp/README.md
@@ -0,0 +1,14 @@
+page_dewarp
+===========
+
+Page dewarping and thresholding using a "cubic sheet" model - see full writeup at <https://mzucker.github.io/2016/08/15/page-dewarping.html>
+
+Requirements:
+
+ - numpy and scipy
+ - OpenCV 3.0 or greater (opencv-python)
+ - Pillow (PIL Image module) and the `future` compatibility package
+ 
+Usage:
+
+    page_dewarp.py IMAGE1 [IMAGE2 ...]
diff --git a/docker/ingest/dewarp/derive_cubic.py b/docker/ingest/dewarp/derive_cubic.py
new file mode 100644
index 0000000..7b4c993
--- /dev/null
+++ b/docker/ingest/dewarp/derive_cubic.py
@@ -0,0 +1,46 @@
+from __future__ import print_function
+import matplotlib.pyplot as plt
+import numpy as np
+import sympy
+
+# create a bunch of symbols
+a, b, c, d, x, alpha, beta = sympy.symbols('a b c d x alpha beta')
+
+# create a polynomial function f(x)
+f = a*x**3 + b*x**2 + c*x + d
+
+# get its derivative f'(x)
+fp = f.diff(x)
+
+# evaluate both at x=0 and x=1
+f0 = f.subs(x, 0)
+f1 = f.subs(x, 1)
+fp0 = fp.subs(x, 0)
+fp1 = fp.subs(x, 1)
+
+# we want a, b, c, d such that the following conditions hold:
+#
+#  f(0) = 0
+#  f(1) = 0
+#  f'(0) = alpha
+#  f'(1) = beta
+
+S = sympy.solve([f0, f1, fp0-alpha, fp1-beta], [a, b, c, d])
+# expected: a = alpha + beta, b = -2*alpha - beta, c = alpha, d = 0
+# print the analytic solution and plot a graphical example
+coeffs = []
+# example end slopes, used only for the plot
+num_alpha = 0.3
+num_beta = 0.03
+# iterate highest degree first: the coefficient order np.polyval expects
+for key in [a, b, c, d]:
+    print(key, '=', S[key])
+    coeffs.append(S[key].subs(dict(alpha=num_alpha,
+                                   beta=num_beta)))
+
+xvals = np.linspace(0, 1, 101)
+yvals = np.polyval(coeffs, xvals)
+
+plt.plot(xvals, yvals)
+plt.show()
+
diff --git a/docker/ingest/dewarp/page_dewarp.py b/docker/ingest/dewarp/page_dewarp.py
new file mode 100755
index 0000000..c0b1e7e
--- /dev/null
+++ b/docker/ingest/dewarp/page_dewarp.py
@@ -0,0 +1,923 @@
+#!/usr/bin/env python
+######################################################################
+# page_dewarp.py - Proof-of-concept of page-dewarping based on a
+# "cubic sheet" model. Requires OpenCV (version 3 or greater),
+# PIL/Pillow, and scipy.optimize.
+######################################################################
+# Author:  Matt Zucker
+# Date:    July 2016
+# License: MIT License (see LICENSE.txt)
+######################################################################
+
+from __future__ import division
+from __future__ import print_function
+from builtins import zip
+from builtins import str
+from builtins import range
+from builtins import object
+from past.utils import old_div
+import os
+import sys
+import datetime
+import cv2
+from PIL import Image
+import numpy as np
+import scipy.optimize
+
+# for some reason pylint complains about cv2 members being undefined :(
+# pylint: disable=E1101
+
+PAGE_MARGIN_X = 50       # reduced px to ignore near L/R edge
+PAGE_MARGIN_Y = 20       # reduced px to ignore near T/B edge
+
+OUTPUT_ZOOM = 1.0        # how much to zoom output relative to *original* image
+OUTPUT_DPI = 300         # just affects stated DPI of PNG, not appearance
+REMAP_DECIMATE = 16      # downscaling factor for remapping image
+
+ADAPTIVE_WINSZ = 55      # window size for adaptive threshold in reduced px
+
+TEXT_MIN_WIDTH = 15      # min reduced px width of detected text contour
+TEXT_MIN_HEIGHT = 2      # min reduced px height of detected text contour
+TEXT_MIN_ASPECT = 1.5    # filter out text contours below this w/h ratio
+TEXT_MAX_THICKNESS = 10  # max reduced px thickness of detected text contour
+
+EDGE_MAX_OVERLAP = 1.0   # max reduced px horiz. overlap of contours in span
+EDGE_MAX_LENGTH = 100.0  # max reduced px length of edge connecting contours
+EDGE_ANGLE_COST = 10.0   # cost of angles in edges (tradeoff vs. length)
+EDGE_MAX_ANGLE = 7.5     # maximum change in angle allowed between contours
+
+RVEC_IDX = slice(0, 3)   # index of rvec in params vector
+TVEC_IDX = slice(3, 6)   # index of tvec in params vector
+CUBIC_IDX = slice(6, 8)  # index of cubic slopes in params vector
+
+SPAN_MIN_WIDTH = 30      # minimum reduced px width for span
+SPAN_PX_PER_STEP = 20    # reduced px spacing for sampling along spans
+FOCAL_LENGTH = 1.2       # normalized focal length of camera
+
+DEBUG_LEVEL = 0          # 0=none, 1=some, 2=lots, 3=all
+DEBUG_OUTPUT = 'file'    # file, screen, both
+
+WINDOW_NAME = 'Dewarp'   # Window name for visualization
+
+# nice color palette for visualizing contours, etc.
+CCOLORS = [
+    (255, 0, 0),
+    (255, 63, 0),
+    (255, 127, 0),
+    (255, 191, 0),
+    (255, 255, 0),
+    (191, 255, 0),
+    (127, 255, 0),
+    (63, 255, 0),
+    (0, 255, 0),
+    (0, 255, 63),
+    (0, 255, 127),
+    (0, 255, 191),
+    (0, 255, 255),
+    (0, 191, 255),
+    (0, 127, 255),
+    (0, 63, 255),
+    (0, 0, 255),
+    (63, 0, 255),
+    (127, 0, 255),
+    (191, 0, 255),
+    (255, 0, 255),
+    (255, 0, 191),
+    (255, 0, 127),
+    (255, 0, 63),
+]
+
+# default camera intrinsics: FOCAL_LENGTH on the diagonal, zero center offset
+K = np.array([
+    [FOCAL_LENGTH, 0, 0],
+    [0, FOCAL_LENGTH, 0],
+    [0, 0, 1]], dtype=np.float32)
+
+
+def debug_show(name, step, text, display):
+    """Save and/or show a debug image, as selected by DEBUG_OUTPUT."""
+    if DEBUG_OUTPUT != 'screen':
+        filetext = text.replace(' ', '_')
+        outfile = name + '_debug_' + str(step) + '_' + filetext + '.png'
+        cv2.imwrite(outfile, display)
+
+    if DEBUG_OUTPUT != 'file':
+        # caption a copy: thick black text underneath thin white text
+        image = display.copy()
+        height = image.shape[0]
+
+        cv2.putText(image, text, (16, height-16),
+                    cv2.FONT_HERSHEY_SIMPLEX, 1.0,
+                    (0, 0, 0), 3, cv2.LINE_AA)
+
+        cv2.putText(image, text, (16, height-16),
+                    cv2.FONT_HERSHEY_SIMPLEX, 1.0,
+                    (255, 255, 255), 1, cv2.LINE_AA)
+
+        cv2.imshow(WINDOW_NAME, image)
+        # keep polling until cv2.waitKey reports a key press
+        while cv2.waitKey(5) < 0:
+            pass
+
+
+def round_nearest_multiple(i, factor):
+    # despite the name: truncate i to an int, then round *up* to a multiple
+    value = int(i)
+    remainder = value % factor
+    if remainder:
+        return value + factor - remainder
+    return value
+
+
+def pix2norm(shape, pts):  # pixel coords -> centered, normalized coords
+    height, width = shape[:2]
+    scl = 2.0/(max(height, width))  # the longer image side spans 2 units
+    offset = np.array([width, height], dtype=pts.dtype).reshape((-1, 1, 2))*0.5
+    return (pts - offset) * scl
+
+
+def norm2pix(shape, pts, as_integer):
+    """Map normalized coordinates back to pixels (inverse of pix2norm)."""
+    height, width = shape[:2]
+    half_extent = max(height, width)*0.5
+    center = np.array([0.5*width, 0.5*height],
+                      dtype=pts.dtype).reshape((-1, 1, 2))
+    pix = pts * half_extent + center
+    if as_integer:
+        return (pix + 0.5).astype(int)
+    return pix
+
+
+def fltp(point):  # point array -> flat tuple of ints (for cv2 drawing calls)
+    return tuple(point.astype(int).flatten())
+
+
+def draw_correspondences(img, dstpoints, projpts):
+    """Return a copy of img with both point sets drawn and joined by lines."""
+    display = img.copy()
+    dstpoints = norm2pix(img.shape, dstpoints, True)
+    projpts = norm2pix(img.shape, projpts, True)
+
+    for pts, color in [(projpts, (255, 0, 0)),
+                       (dstpoints, (0, 0, 255))]:
+
+        for point in pts:
+            cv2.circle(display, fltp(point), 3, color, -1, cv2.LINE_AA)
+
+    for point_a, point_b in zip(projpts, dstpoints):
+        cv2.line(display, fltp(point_a), fltp(point_b),
+                 (255, 255, 255), 1, cv2.LINE_AA)
+
+    return display
+
+
+def get_default_params(corners, ycoords, xcoords):
+    """Build the initial parameter vector from page corners and span coords."""
+    # page width and height
+    page_width = np.linalg.norm(corners[1] - corners[0])
+    page_height = np.linalg.norm(corners[-1] - corners[0])
+    rough_dims = (page_width, page_height)
+
+    # our initial guess for the cubic has no slope
+    cubic_slopes = [0.0, 0.0]
+
+    # object points of flat page in 3D coordinates
+    corners_object3d = np.array([
+        [0, 0, 0],
+        [page_width, 0, 0],
+        [page_width, page_height, 0],
+        [0, page_height, 0]])
+
+    # estimate rotation and translation from four 2D-to-3D point
+    # correspondences
+    _, rvec, tvec = cv2.solvePnP(corners_object3d,
+                                 corners, K, np.zeros(5))
+
+    span_counts = [len(xc) for xc in xcoords]
+    # params layout: rvec, tvec, cubic slopes, per-span y, then per-point x
+    params = np.hstack((np.array(rvec).flatten(),
+                        np.array(tvec).flatten(),
+                        np.array(cubic_slopes).flatten(),
+                        ycoords.flatten()) +
+                       tuple(xcoords))
+
+    return rough_dims, span_counts, params
+
+
+def project_xy(xy_coords, pvec):
+    """Project page (x, y) points into the image using the cubic sheet."""
+    # get cubic polynomial coefficients given
+    #
+    #  f(0) = 0, f'(0) = alpha
+    #  f(1) = 0, f'(1) = beta
+
+    alpha, beta = tuple(pvec[CUBIC_IDX])
+
+    poly = np.array([
+        alpha + beta,
+        -2*alpha - beta,
+        alpha,
+        0])
+
+    xy_coords = xy_coords.reshape((-1, 2))
+    z_coords = np.polyval(poly, xy_coords[:, 0])
+    # append the cubic's value at each x as z to get 3D object points
+    objpoints = np.hstack((xy_coords, z_coords.reshape((-1, 1))))
+
+    image_points, _ = cv2.projectPoints(objpoints,
+                                        pvec[RVEC_IDX],
+                                        pvec[TVEC_IDX],
+                                        K, np.zeros(5))
+
+    return image_points
+
+
+def project_keypoints(pvec, keypoint_index):
+    """Look up the keypoint coordinates stored in pvec and project them."""
+    xy_coords = pvec[keypoint_index]
+    xy_coords[0, :] = 0  # first keypoint is pinned to the page origin (0, 0)
+
+    return project_xy(xy_coords, pvec)
+
+
+def resize_to_screen(src, maxw=1280, maxh=700, copy=False):
+    """Shrink src by an integer factor until it fits within maxw x maxh."""
+    height, width = src.shape[:2]
+
+    scl_x = float(width)/maxw
+    scl_y = float(height)/maxh
+    # rounded up to an integer, so even a 1.2x overshoot halves the image
+    scl = int(np.ceil(max(scl_x, scl_y)))
+
+    if scl > 1.0:
+        inv_scl = 1.0/scl
+        img = cv2.resize(src, (0, 0), None, inv_scl, inv_scl, cv2.INTER_AREA)
+    elif copy:
+        img = src.copy()
+    else:
+        img = src
+
+    return img
+
+
+def box(width, height):  # all-ones uint8 kernel for cv2.dilate/cv2.erode
+    return np.ones((height, width), dtype=np.uint8)
+
+
+def get_page_extents(small):
+    """Return a mask and outline of the image area inside the page margins."""
+    height, width = small.shape[:2]
+
+    xmin = PAGE_MARGIN_X
+    ymin = PAGE_MARGIN_Y
+    xmax = width-PAGE_MARGIN_X
+    ymax = height-PAGE_MARGIN_Y
+
+    page = np.zeros((height, width), dtype=np.uint8)
+    cv2.rectangle(page, (xmin, ymin), (xmax, ymax), (255, 255, 255), -1)
+
+    outline = np.array([
+        [xmin, ymin],
+        [xmin, ymax],
+        [xmax, ymax],
+        [xmax, ymin]])
+
+    return page, outline
+
+
+def get_mask(name, small, pagemask, masktype):
+    """Build a binary mask of text (or line) pixels, clipped to pagemask."""
+    sgray = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)
+
+    if masktype == 'text':
+        # threshold, then dilate with a 9x1 box and erode with a 1x3 box
+        mask = cv2.adaptiveThreshold(sgray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
+                                     cv2.THRESH_BINARY_INV,
+                                     ADAPTIVE_WINSZ,
+                                     25)
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.1, 'thresholded', mask)
+
+        mask = cv2.dilate(mask, box(9, 1))
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.2, 'dilated', mask)
+
+        mask = cv2.erode(mask, box(1, 3))
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.3, 'eroded', mask)
+
+    else:
+        # any other masktype (main passes 'line'): smaller threshold offset
+        mask = cv2.adaptiveThreshold(sgray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
+                                     cv2.THRESH_BINARY_INV,
+                                     ADAPTIVE_WINSZ,
+                                     7)
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.4, 'thresholded', mask)
+
+        mask = cv2.erode(mask, box(3, 1), iterations=3)
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.5, 'eroded', mask)
+
+        mask = cv2.dilate(mask, box(8, 2))
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.6, 'dilated', mask)
+
+    return np.minimum(mask, pagemask)
+
+
+def interval_measure_overlap(int_a, int_b):  # negative => gap, not overlap
+    return min(int_a[1], int_b[1]) - max(int_a[0], int_b[0])
+
+
+def angle_dist(angle_b, angle_a):
+    """Return the absolute angular difference, wrapped into [0, pi]."""
+    diff = angle_b - angle_a
+
+    while diff > np.pi:
+        diff -= 2*np.pi
+
+    while diff < -np.pi:
+        diff += 2*np.pi
+
+    return np.abs(diff)
+
+
+def blob_mean_and_tangent(contour):
+    """Return a contour's centroid and its dominant axis (tangent)."""
+    moments = cv2.moments(contour)
+
+    area = moments['m00']
+
+    mean_x = old_div(moments['m10'], area)
+    mean_y = old_div(moments['m01'], area)
+
+    moments_matrix = old_div(np.array([
+        [moments['mu20'], moments['mu11']],
+        [moments['mu11'], moments['mu02']]
+    ]), area)
+    # first left singular vector of the central-moment matrix is the tangent
+    _, svd_u, _ = cv2.SVDecomp(moments_matrix)
+
+    center = np.array([mean_x, mean_y])
+    tangent = svd_u[:, 0].flatten().copy()
+
+    return center, tangent
+
+
+class ContourInfo(object):
+    """One detected contour with its axis, extent and span neighbours."""
+    def __init__(self, contour, rect, mask):
+        # raw contour points, bounding rect and cropped fill mask
+        self.contour = contour
+        self.rect = rect
+        self.mask = mask
+
+        self.center, self.tangent = blob_mean_and_tangent(contour)
+
+        self.angle = np.arctan2(self.tangent[1], self.tangent[0])
+        # project every contour point onto the tangent axis
+        clx = [self.proj_x(point) for point in contour]
+
+        lxmin = min(clx)
+        lxmax = max(clx)
+
+        self.local_xrng = (lxmin, lxmax)
+
+        self.point0 = self.center + self.tangent * lxmin
+        self.point1 = self.center + self.tangent * lxmax
+        # links to the neighbouring contours, set by assemble_spans
+        self.pred = None
+        self.succ = None
+
+    def proj_x(self, point):  # signed offset of point along the tangent
+        return np.dot(self.tangent, point.flatten()-self.center)
+
+    def local_overlap(self, other):  # overlap of other's extent with ours
+        xmin = self.proj_x(other.point0)
+        xmax = self.proj_x(other.point1)
+        return interval_measure_overlap(self.local_xrng, (xmin, xmax))
+
+
+def generate_candidate_edge(cinfo_a, cinfo_b):
+    """Score linking two contours left-to-right; None if implausible."""
+    # we want a left of b (so a's successor will be b and b's
+    # predecessor will be a) make sure right endpoint of b is to the
+    # right of left endpoint of a.
+    if cinfo_a.point0[0] > cinfo_b.point1[0]:
+        cinfo_a, cinfo_b = cinfo_b, cinfo_a
+
+    x_overlap_a = cinfo_a.local_overlap(cinfo_b)
+    x_overlap_b = cinfo_b.local_overlap(cinfo_a)
+
+    overall_tangent = cinfo_b.center - cinfo_a.center
+    overall_angle = np.arctan2(overall_tangent[1], overall_tangent[0])
+
+    # larger deviation of either contour from the joining line, in degrees
+    delta_angle = old_div(max(angle_dist(cinfo_a.angle, overall_angle),
+                              angle_dist(cinfo_b.angle, overall_angle)) * 180,
+                          np.pi)
+
+    # we want the largest overlap in x to be small
+    x_overlap = max(x_overlap_a, x_overlap_b)
+
+    dist = np.linalg.norm(cinfo_b.point0 - cinfo_a.point1)
+
+    if (dist > EDGE_MAX_LENGTH or
+            x_overlap > EDGE_MAX_OVERLAP or
+            delta_angle > EDGE_MAX_ANGLE):
+        return None
+
+    score = dist + delta_angle*EDGE_ANGLE_COST
+    return (score, cinfo_a, cinfo_b)
+
+
+def make_tight_mask(contour, xmin, ymin, width, height):
+    """Draw the filled contour into a width x height mask at (xmin, ymin)."""
+    tight_mask = np.zeros((height, width), dtype=np.uint8)
+    tight_contour = contour - np.array((xmin, ymin)).reshape((-1, 1, 2))
+
+    cv2.drawContours(tight_mask, [tight_contour], 0,
+                     (1, 1, 1), -1)
+
+    return tight_mask
+
+
+def get_contours(name, small, pagemask, masktype):
+    """Return ContourInfo objects for contours passing the size filters."""
+    mask = get_mask(name, small, pagemask, masktype)
+    # [-2] picks the contours on OpenCV 3 (3 results) and OpenCV 4 (2 results)
+    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL,
+                                cv2.CHAIN_APPROX_NONE)[-2]
+
+    contours_out = []
+
+    for contour in contours:
+
+        rect = cv2.boundingRect(contour)
+        xmin, ymin, width, height = rect
+
+        if (width < TEXT_MIN_WIDTH or
+                height < TEXT_MIN_HEIGHT or
+                width < TEXT_MIN_ASPECT*height):
+            continue
+
+        tight_mask = make_tight_mask(contour, xmin, ymin, width, height)
+        # reject blobs with any column thicker than TEXT_MAX_THICKNESS
+        if tight_mask.sum(axis=0).max() > TEXT_MAX_THICKNESS:
+            continue
+
+        contours_out.append(ContourInfo(contour, rect, tight_mask))
+
+    if DEBUG_LEVEL >= 2:
+        visualize_contours(name, small, contours_out)
+
+    return contours_out
+
+
+def assemble_spans(name, small, pagemask, cinfo_list):
+    """Greedily chain contours into left-to-right spans; return the spans."""
+    # sort contours top to bottom (by bounding-rect y)
+    cinfo_list = sorted(cinfo_list, key=lambda cinfo: cinfo.rect[1])
+
+    # generate all candidate edges
+    candidate_edges = []
+
+    for i, cinfo_i in enumerate(cinfo_list):
+        for j in range(i):
+            # note e is of the form (score, left_cinfo, right_cinfo)
+            edge = generate_candidate_edge(cinfo_i, cinfo_list[j])
+            if edge is not None:
+                candidate_edges.append(edge)
+    # sort candidate edges by score only (lower is better); sorting the whole
+    # tuples would compare ContourInfo objects on ties and raise TypeError
+    candidate_edges.sort(key=lambda edge: edge[0])
+
+    # for each candidate edge
+    for _, cinfo_a, cinfo_b in candidate_edges:
+        # if left and right are unassigned, join them
+        if cinfo_a.succ is None and cinfo_b.pred is None:
+            cinfo_a.succ = cinfo_b
+            cinfo_b.pred = cinfo_a
+
+    # generate list of spans as output
+    spans = []
+
+    # until we have removed everything from the list
+    while cinfo_list:
+
+        # get the first on the list
+        cinfo = cinfo_list[0]
+
+        # keep following predecessors until none exists
+        while cinfo.pred:
+            cinfo = cinfo.pred
+
+        # start a new span
+        cur_span = []
+
+        width = 0.0
+
+        # follow successors til end of span
+        while cinfo:
+            # remove from list (sadly making this loop *also* O(n^2))
+            cinfo_list.remove(cinfo)
+            # add to span
+            cur_span.append(cinfo)
+            width += cinfo.local_xrng[1] - cinfo.local_xrng[0]
+            # set successor
+            cinfo = cinfo.succ
+
+        # add if long enough
+        if width > SPAN_MIN_WIDTH:
+            spans.append(cur_span)
+
+    if DEBUG_LEVEL >= 2:
+        visualize_spans(name, small, pagemask, spans)
+
+    return spans
+
+
+def sample_spans(shape, spans):
+    """Sample each span at regular x steps; return normalized point arrays."""
+    span_points = []
+
+    for span in spans:
+
+        contour_points = []
+
+        for cinfo in span:
+            # mean row index of the filled pixels in each mask column
+            yvals = np.arange(cinfo.mask.shape[0]).reshape((-1, 1))
+            totals = (yvals * cinfo.mask).sum(axis=0)
+            means = old_div(totals, cinfo.mask.sum(axis=0))
+
+            xmin, ymin = cinfo.rect[:2]
+            # offset the first sample so the grid is centered in the contour
+            step = SPAN_PX_PER_STEP
+            start = old_div(((len(means)-1) % step), 2)
+
+            contour_points += [(x+xmin, means[x]+ymin)
+                               for x in range(start, len(means), step)]
+
+        contour_points = np.array(contour_points,
+                                  dtype=np.float32).reshape((-1, 1, 2))
+
+        contour_points = pix2norm(shape, contour_points)
+
+        span_points.append(contour_points)
+
+    return span_points
+
+
+def keypoints_from_samples(name, small, pagemask, page_outline,
+                           span_points):
+    """Derive page axes and corners from the spans, plus per-span coords."""
+    all_evecs = np.array([[0.0, 0.0]])
+    all_weights = 0
+
+    for points in span_points:
+        # principal direction of this span, weighted by end-to-end length
+        _, evec = cv2.PCACompute(points.reshape((-1, 2)),
+                                 None, maxComponents=1)
+
+        weight = np.linalg.norm(points[-1] - points[0])
+
+        all_evecs += evec * weight
+        all_weights += weight
+
+    evec = old_div(all_evecs, all_weights)
+
+    x_dir = evec.flatten()
+    # flip so that the x axis has a non-negative x component
+    if x_dir[0] < 0:
+        x_dir = -x_dir
+
+    y_dir = np.array([-x_dir[1], x_dir[0]])
+    # bounding box of the page outline in the (x_dir, y_dir) frame
+    pagecoords = cv2.convexHull(page_outline)
+    pagecoords = pix2norm(pagemask.shape, pagecoords.reshape((-1, 1, 2)))
+    pagecoords = pagecoords.reshape((-1, 2))
+
+    px_coords = np.dot(pagecoords, x_dir)
+    py_coords = np.dot(pagecoords, y_dir)
+
+    px0 = px_coords.min()
+    px1 = px_coords.max()
+
+    py0 = py_coords.min()
+    py1 = py_coords.max()
+
+    p00 = px0 * x_dir + py0 * y_dir
+    p10 = px1 * x_dir + py0 * y_dir
+    p11 = px1 * x_dir + py1 * y_dir
+    p01 = px0 * x_dir + py1 * y_dir
+
+    corners = np.vstack((p00, p10, p11, p01)).reshape((-1, 1, 2))
+
+    ycoords = []
+    xcoords = []
+    # per span: mean y offset and per-point x offsets, measured from p00
+    for points in span_points:
+        pts = points.reshape((-1, 2))
+        px_coords = np.dot(pts, x_dir)
+        py_coords = np.dot(pts, y_dir)
+        ycoords.append(py_coords.mean() - py0)
+        xcoords.append(px_coords - px0)
+
+    if DEBUG_LEVEL >= 2:
+        visualize_span_points(name, small, span_points, corners)
+
+    return corners, np.array(ycoords), xcoords
+
+
+def visualize_contours(name, small, cinfo_list):
+    """Write a debug image with each contour tinted and its axis drawn."""
+    regions = np.zeros_like(small)
+
+    for j, cinfo in enumerate(cinfo_list):
+
+        cv2.drawContours(regions, [cinfo.contour], 0,
+                         CCOLORS[j % len(CCOLORS)], -1)
+
+    mask = (regions.max(axis=2) != 0)
+
+    display = small.copy()
+    display[mask] = (old_div(display[mask],2)) + (old_div(regions[mask],2))
+
+    for cinfo in cinfo_list:
+        # mark the centroid and the extent along the tangent in white; the
+        # per-contour color that used to be computed here was never used
+
+        cv2.circle(display, fltp(cinfo.center), 3,
+                   (255, 255, 255), 1, cv2.LINE_AA)
+
+        cv2.line(display, fltp(cinfo.point0), fltp(cinfo.point1),
+                 (255, 255, 255), 1, cv2.LINE_AA)
+
+    debug_show(name, 1, 'contours', display)
+
+
+def visualize_spans(name, small, pagemask, spans):
+    """Write a debug image with every span tinted in its own color."""
+    regions = np.zeros_like(small)
+
+    for i, span in enumerate(spans):
+        contours = [cinfo.contour for cinfo in span]
+        cv2.drawContours(regions, contours, -1,
+                         CCOLORS[i*3 % len(CCOLORS)], -1)
+
+    mask = (regions.max(axis=2) != 0)
+    # blend span colors 50/50 with the image, then darken outside the page
+    display = small.copy()
+    display[mask] = (old_div(display[mask],2)) + (old_div(regions[mask],2))
+    display[pagemask == 0] //= 4
+
+    debug_show(name, 2, 'spans', display)
+
+
+def visualize_span_points(name, small, span_points, corners):
+    """Write a debug image of span samples, fitted lines and page corners."""
+    display = small.copy()
+
+    for i, points in enumerate(span_points):
+
+        points = norm2pix(small.shape, points, False)
+
+        mean, small_evec = cv2.PCACompute(points.reshape((-1, 2)),
+                                          None,
+                                          maxComponents=1)
+        # line endpoints: the extreme projections onto the PCA axis
+        dps = np.dot(points.reshape((-1, 2)), small_evec.reshape((2, 1)))
+        dpm = np.dot(mean.flatten(), small_evec.flatten())
+
+        point0 = mean + small_evec * (dps.min()-dpm)
+        point1 = mean + small_evec * (dps.max()-dpm)
+
+        for point in points:
+            cv2.circle(display, fltp(point), 3,
+                       CCOLORS[i % len(CCOLORS)], -1, cv2.LINE_AA)
+
+        cv2.line(display, fltp(point0), fltp(point1),
+                 (255, 255, 255), 1, cv2.LINE_AA)
+
+    cv2.polylines(display, [norm2pix(small.shape, corners, True)],
+                  True, (255, 255, 255))
+
+    debug_show(name, 3, 'span points', display)
+
+
+def imgsize(img):  # image size formatted as 'WIDTHxHEIGHT'
+    height, width = img.shape[:2]
+    return '{}x{}'.format(width, height)
+
+
+def make_keypoint_index(span_counts):
+    """Map each keypoint to the params slots holding its (x, y) values."""
+    nspans = len(span_counts)
+    npts = sum(span_counts)
+    keypoint_index = np.zeros((npts+1, 2), dtype=int)
+    start = 1  # row 0 is left as zeros: a placeholder row
+
+    for i, count in enumerate(span_counts):
+        end = start + count
+        keypoint_index[start:end, 1] = 8+i  # this span's rows share y slot
+        start = end
+
+    # x slots follow the 8 fixed params and the nspans per-span y slots
+    keypoint_index[1:, 0] = np.arange(npts) + 8 + nspans
+    return keypoint_index
+
+
+def optimize_params(name, small, dstpoints, span_counts, params):
+    """Fit params so projected keypoints match dstpoints (Powell method)."""
+    keypoint_index = make_keypoint_index(span_counts)
+    # objective: sum of squared reprojection errors over all keypoints
+    def objective(pvec):
+        ppts = project_keypoints(pvec, keypoint_index)
+        return np.sum((dstpoints - ppts)**2)
+
+    print('  initial objective is', objective(params))
+
+    if DEBUG_LEVEL >= 1:
+        projpts = project_keypoints(params, keypoint_index)
+        display = draw_correspondences(small, dstpoints, projpts)
+        debug_show(name, 4, 'keypoints before', display)
+
+    print('  optimizing', len(params), 'parameters...')
+    start = datetime.datetime.now()
+    res = scipy.optimize.minimize(objective, params,
+                                  method='Powell')
+    end = datetime.datetime.now()
+    print('  optimization took', round((end-start).total_seconds(), 2), 'sec.')
+    print('  final objective is', res.fun)
+    params = res.x
+
+    if DEBUG_LEVEL >= 1:
+        projpts = project_keypoints(params, keypoint_index)
+        display = draw_correspondences(small, dstpoints, projpts)
+        debug_show(name, 5, 'keypoints after', display)
+
+    return params
+
+
+def get_page_dims(corners, rough_dims, params):
+    """Refine the page size so its far corner projects onto corners[2]."""
+    dst_br = corners[2].flatten()
+
+    dims = np.array(rough_dims)
+
+    def objective(dims):
+        proj_br = project_xy(dims, params)
+        return np.sum((dst_br - proj_br.flatten())**2)
+
+    res = scipy.optimize.minimize(objective, dims, method='Powell')
+    dims = res.x
+
+    print('  got page dims', dims[0], 'x', dims[1])
+
+    return dims
+
+
+def remap_image(name, dirname, img, small, page_dims, params):
+    """Dewarp + threshold img, save <name>_thresh.png, return its name."""
+    height = 0.5 * page_dims[1] * OUTPUT_ZOOM * img.shape[0]
+    height = round_nearest_multiple(height, REMAP_DECIMATE)
+
+    width = round_nearest_multiple(old_div(height * page_dims[0], page_dims[1]),
+                                   REMAP_DECIMATE)
+
+    print('  output will be {}x{}'.format(width, height))
+    # project a coarse grid (decimated by REMAP_DECIMATE), then upsample it
+    height_small = old_div(height, REMAP_DECIMATE)
+    width_small = old_div(width, REMAP_DECIMATE)
+
+    page_x_range = np.linspace(0, page_dims[0], width_small)
+    page_y_range = np.linspace(0, page_dims[1], height_small)
+
+    page_x_coords, page_y_coords = np.meshgrid(page_x_range, page_y_range)
+
+    page_xy_coords = np.hstack((page_x_coords.flatten().reshape((-1, 1)),
+                                page_y_coords.flatten().reshape((-1, 1))))
+
+    page_xy_coords = page_xy_coords.astype(np.float32)
+
+    image_points = project_xy(page_xy_coords, params)
+    image_points = norm2pix(img.shape, image_points, False)
+
+    image_x_coords = image_points[:, 0, 0].reshape(page_x_coords.shape)
+    image_y_coords = image_points[:, 0, 1].reshape(page_y_coords.shape)
+
+    image_x_coords = cv2.resize(image_x_coords, (width, height),
+                                interpolation=cv2.INTER_CUBIC)
+
+    image_y_coords = cv2.resize(image_y_coords, (width, height),
+                                interpolation=cv2.INTER_CUBIC)
+
+    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+
+    remapped = cv2.remap(img_gray, image_x_coords, image_y_coords,
+                         cv2.INTER_CUBIC,
+                         None, cv2.BORDER_REPLICATE)
+
+    thresh = cv2.adaptiveThreshold(remapped, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
+                                   cv2.THRESH_BINARY, ADAPTIVE_WINSZ, 25)
+
+    pil_image = Image.fromarray(thresh)
+    pil_image = pil_image.convert('1')
+    # os.path.join: an empty dirname (bare file name) must not turn into '/'
+    threshfile = name + '_thresh.png'
+    pil_image.save(os.path.join(dirname, threshfile),
+                   dpi=(OUTPUT_DPI, OUTPUT_DPI))
+    if DEBUG_LEVEL >= 1:
+        height = small.shape[0]
+        width = int(round(height * float(thresh.shape[1])/thresh.shape[0]))
+        display = cv2.resize(thresh, (width, height),
+                             interpolation=cv2.INTER_AREA)
+        debug_show(name, 6, 'output', display)
+
+    return threshfile
+
+
+def main():
+    """Dewarp every image named on the command line."""
+    if len(sys.argv) < 2:
+        print('usage:', sys.argv[0], 'IMAGE1 [IMAGE2 ...]')
+        sys.exit(0)
+
+    if DEBUG_LEVEL > 0 and DEBUG_OUTPUT != 'file':
+        cv2.namedWindow(WINDOW_NAME)
+
+    outfiles = []
+
+    for imgfile in sys.argv[1:]:
+        img = cv2.imread(imgfile)
+        if img is None:  # cv2.imread signals failure by returning None
+            print('skipping', imgfile, 'because it could not be read')
+            continue
+
+        small = resize_to_screen(img)
+        basename = os.path.basename(imgfile)
+        dirname = os.path.dirname(imgfile)
+        name, _ = os.path.splitext(basename)
+
+        print('loaded', basename, 'with size', imgsize(img), end=' ')
+        print('and resized to', imgsize(small))
+
+        if DEBUG_LEVEL >= 3:
+            debug_show(name, 0.0, 'original', small)
+
+        pagemask, page_outline = get_page_extents(small)
+
+        cinfo_list = get_contours(name, small, pagemask, 'text')
+        spans = assemble_spans(name, small, pagemask, cinfo_list)
+
+        if len(spans) < 3:
+            print('  detecting lines because only', len(spans), 'text spans')
+            cinfo_list = get_contours(name, small, pagemask, 'line')
+            spans2 = assemble_spans(name, small, pagemask, cinfo_list)
+            if len(spans2) > len(spans):
+                spans = spans2
+
+        if len(spans) < 1:
+            print('skipping', name, 'because only', len(spans), 'spans')
+            continue
+
+        span_points = sample_spans(small.shape, spans)
+        print('  got', len(spans), 'spans', end=' ')
+        print('with', sum([len(pts) for pts in span_points]), 'points.')
+
+        corners, ycoords, xcoords = keypoints_from_samples(name, small,
+                                                           pagemask,
+                                                           page_outline,
+                                                           span_points)
+
+        rough_dims, span_counts, params = get_default_params(corners,
+                                                             ycoords, xcoords)
+
+        dstpoints = np.vstack((corners[0].reshape((1, 1, 2)),) +
+                              tuple(span_points))
+
+        params = optimize_params(name, small,
+                                 dstpoints,
+                                 span_counts, params)
+
+        page_dims = get_page_dims(corners, rough_dims, params)
+        outfile = remap_image(name, dirname, img, small, page_dims, params)
+        outfiles.append(outfile)
+
+        print('  wrote', outfile)
+        print()
+
+    print('to convert to PDF (requires ImageMagick):')
+    print('  convert -compress Group4 ' + ' '.join(outfiles) + ' output.pdf')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/docker/ingest/dewarp/requirements.txt b/docker/ingest/dewarp/requirements.txt
new file mode 100644
index 0000000..c716cb0
--- /dev/null
+++ b/docker/ingest/dewarp/requirements.txt
@@ -0,0 +1,5 @@
+numpy
+scipy
+Pillow
+opencv-python
+future