|
|
<?php
namespace App\Ingest;
use PhpOffice\PhpWord\Element\AbstractElement; use PhpOffice\PhpWord\Element\Bookmark; use PhpOffice\PhpWord\Element\Header; use PhpOffice\PhpWord\Element\Image; use PhpOffice\PhpWord\Element\Line; use PhpOffice\PhpWord\Element\Link; use PhpOffice\PhpWord\Element\ListItem; use PhpOffice\PhpWord\Element\ListItemRun; use PhpOffice\PhpWord\Element\PageBreak; use PhpOffice\PhpWord\Element\PreserveText; use PhpOffice\PhpWord\Element\Section; use PhpOffice\PhpWord\Element\Table; use PhpOffice\PhpWord\Element\Text; use PhpOffice\PhpWord\Element\TextBreak; use PhpOffice\PhpWord\Element\TextRun; use PhpOffice\PhpWord\Element\Title; use PhpOffice\PhpWord\IOFactory; use PhpOffice\PhpWord\Style;
/**
 * Reads a document through PhpWord and turns it into a nested array
 * (sections, elements and styles) plus a flat text index.
 *
 * Every element/style exporter is a protected "<name>ToJson" method so that
 * handleElement() and handleStyles() can dispatch on the short class name of
 * the PhpWord object they receive.
 */
class DocxReader extends AbstractConvertor
{
    /**
     * Flat text index filled by addText() while the document is walked.
     *
     * 'text' is the concatenation of every string passed to addText();
     * 'elements' holds one ['hash', 'range_start', 'range_end'] entry per call.
     *
     * @var array
     */
    protected $textContents;

    /**
     * Running offset used to compute the ranges stored in $textContents.
     *
     * @var int
     */
    protected $textLength;

    /**
     * @param mixed $storage Forwarded to the parent constructor; execute() calls ->path() on it.
     * @param mixed $path    Forwarded to the parent constructor; passed to $storage->path().
     */
    public function __construct($storage, $path)
    {
        parent::__construct($storage, $path);

        $this->textContents = [
            'text' => '',
            'elements' => [],
        ];
        $this->textLength = 0;

        // A reader map for other formats (docx => Word2007, odt => ODText,
        // rtf => RTF) was sketched here but is not wired in: execute() always
        // calls IOFactory::load() without an explicit reader name.
    }

    /**
     * Loads the document and exports it.
     *
     * Converting to HTML and then back to DOCX loses some content and styles,
     * which is why the document is exported as a structured array instead.
     *
     * @ISSUE At the time of writing (08/Sept/2021) PhpWord does not read
     * bookmarks from DOCX files. Support can be added, for example, by
     * inserting the following lines into AbstractPart.php before the Text and
     * TextRun handling case:
     *
     *     $els = $xmlReader->getElements('w:bookmarkStart', $domNode);
     *     if ($els && $els->count() > 0) {
     *         $parent->addBookmark($els[0]->getAttribute('w:name'));
     *     }
     *
     * @return array Keys: default_font_name, default_font_size, styles,
     *               elements (one entry per section) and contents (the text index).
     */
    public function execute()
    {
        $handler = IOFactory::load($this->storage->path($this->path));

        $data = [
            'default_font_name' => $handler->getDefaultFontName(),
            'default_font_size' => $handler->getDefaultFontSize(),
            'styles' => $this->handleStyles(Style::getStyles()),
        ];

        $elements = [];
        foreach ($handler->getSections() as $section) {
            $elements[] = $this->sectionToJson($section);
        }
        $data['elements'] = $elements;

        // Must be read after the walk above: addText() fills it as a side effect.
        $data['contents'] = $this->textContents;

        return $data;
    }

    /**
     * Exports a section with its body elements, footers, headers and style.
     *
     * The body is walked first, then footers, then headers; this order decides
     * the order of entries in the text index. Footnote properties are not exported.
     *
     * @param Section $section
     * @return array
     */
    protected function sectionToJson(Section $section)
    {
        return array_merge($this->elementToJson($section), [
            'element_type' => 'Section',
            'elements' => $this->elementsToJson($section->getElements()),
            'footers' => $this->handleFooters($section->getFooters()),
            'headers' => $this->handleHeaders($section->getHeaders()),
            'style' => $this->sectionStyleToJson($section->getStyle()),
        ]);
    }

    /**
     * Exports each registered style through "<shortClass>StyleToJson".
     *
     * Array keys of $styles are preserved. A style class without a matching
     * exporter still raises PHP's "undefined method" error, as before.
     *
     * @param array $styles Style objects, as returned by Style::getStyles().
     * @return array
     */
    protected function handleStyles($styles)
    {
        return array_map(function ($style) {
            $handler = lcfirst($this->shortClassName($style)) . 'StyleToJson';

            return $this->$handler($style);
        }, $styles);
    }

    /**
     * @param array $headers
     * @return array List of exported headers.
     */
    protected function handleHeaders(array $headers)
    {
        $list = [];
        foreach ($headers as $header) {
            $list[] = $this->headerToJson($header);
        }

        return $list;
    }

    /**
     * @param array $footers
     * @return array List of exported footers.
     */
    protected function handleFooters(array $footers)
    {
        $list = [];
        foreach ($footers as $footer) {
            $list[] = $this->footerToJson($footer);
        }

        return $list;
    }

    /**
     * Exports one element by dispatching to "<shortClass>ToJson".
     *
     * The common fields from elementToJson() are always merged in first, so
     * exporters that return only a type tag still carry them. Element types
     * without a dedicated exporter no longer abort the export: they get a
     * generic record from unsupportedElementToJson().
     *
     * @param object $element
     * @return array
     */
    protected function handleElement($element)
    {
        $className = $this->shortClassName($element);
        $handler = lcfirst($className) . 'ToJson';
        $common = $this->elementToJson($element);

        if (!is_callable([$this, $handler])) {
            return array_merge($common, $this->unsupportedElementToJson($element, $className));
        }

        return array_merge($common, $this->$handler($element));
    }

    /**
     * Fields shared by every element.
     *
     * Comment range start/end and the parent reference are intentionally not exported.
     *
     * @param AbstractElement $element
     * @return array
     */
    protected function elementToJson(AbstractElement $element)
    {
        return [
            'doc_part' => $element->getDocPart(),
            'doc_part_id' => $element->getDocPartId(),
            'element_id' => $element->getElementId(),
            'element_index' => $element->getElementIndex(),
            'nested_level' => $element->getNestedLevel(),
            'relation_id' => $element->getRelationId(),
            'section_id' => $element->getSectionId(),
        ];
    }

    /**
     * @param array $elements
     * @return array List with one handleElement() result per input element.
     */
    protected function elementsToJson(array $elements)
    {
        $list = [];
        foreach ($elements as $element) {
            $list[] = $this->handleElement($element);
        }

        return $list;
    }

    /**
     * Same shape as a footer, tagged as 'Header'.
     *
     * @param Header $header
     * @return array
     */
    protected function headerToJson(Header $header)
    {
        return array_merge(
            $this->footerToJson($header),
            [
                'element_type' => 'Header',
            ]
        );
    }

    /**
     * @param object $footer Must provide getElements() and getType().
     * @return array
     */
    protected function footerToJson($footer)
    {
        return array_merge($this->elementToJson($footer), [
            'element_type' => 'Footer',
            'elements' => $this->elementsToJson($footer->getElements()),
            'type' => $footer->getType(),
        ]);
    }

    /**
     * @param Bookmark $element
     * @return array
     */
    protected function bookmarkToJson(Bookmark $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Bookmark',
            'name' => $element->getName(),
        ]);
    }

    /**
     * @param Image $image
     * @return array
     */
    protected function imageToJson(Image $image)
    {
        return array_merge($this->elementToJson($image), [
            'element_type' => 'Image',
            'name' => $image->getName(),
            'style' => $this->imageStyleToJson($image->getStyle()),
            'source' => $image->getSource(),
            'source_type' => $image->getSourceType(),
            'is_watermark' => $image->isWatermark(),
        ]);
    }

    /**
     * @param Line $element
     * @return array
     */
    protected function lineToJson(Line $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Line',
            'style' => $this->lineStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Exports a hyperlink with its target and visible text.
     *
     * The link text is not added to the text index.
     *
     * @param Link $element
     * @return array
     */
    protected function linkToJson(Link $element)
    {
        return [
            'element_type' => 'Link',
            'source' => $element->getSource(),
            'text' => $element->getText(),
        ];
    }

    /**
     * @param ListItem $element
     * @return array
     */
    protected function listItemToJson(ListItem $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'ListItem',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
            'text' => $element->getText(),
            // Exporting the text object also registers the text in the index.
            'text_object' => $this->textToJson($element->getTextObject()),
        ]);
    }

    /**
     * A text run that is also a list item: text-run fields plus depth and list style.
     *
     * @param ListItemRun $element
     * @return array
     */
    protected function listItemRunToJson(ListItemRun $element)
    {
        return array_merge($this->textRunToJson($element), [
            'element_type' => 'ListItemRun',
            'depth' => $element->getDepth(),
            'style' => $this->listItemStyleToJson($element->getStyle()),
        ]);
    }

    /**
     * Exports preserved text and registers it in the text index.
     *
     * getText() may return the text as an array of fragments; all fragments
     * are joined so nothing after the first one is dropped.
     *
     * @param PreserveText $element
     * @return array
     */
    protected function preserveTextToJson(PreserveText $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();

        $text = $element->getText();
        if (is_array($text)) {
            $text = implode('', $text);
        }

        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'PreserveText',
            'font_style' => $this->optionalFontStyleToJson($fontStyle),
            'paragraph_style' => $this->optionalParagraphStyleToJson($paragraphStyle),
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * Only the type tag; handleElement() adds the common fields.
     *
     * @param PageBreak $element
     * @return array
     */
    protected function pageBreakToJson(PageBreak $element)
    {
        return [
            'element_type' => 'PageBreak',
        ];
    }

    /**
     * Exports a text element and registers its text in the text index.
     *
     * @param Text $element
     * @return array
     */
    protected function textToJson(Text $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();
        $text = $element->getText();

        $hash = $this->addText($text);

        return array_merge($this->elementToJson($element), [
            'element_type' => 'Text',
            'font_style' => $this->optionalFontStyleToJson($fontStyle),
            'paragraph_style' => $this->optionalParagraphStyleToJson($paragraphStyle),
            'text' => $text,
            'hash' => $hash,
        ]);
    }

    /**
     * @param TextBreak $element
     * @return array
     */
    protected function textBreakToJson(TextBreak $element)
    {
        $fontStyle = $element->getFontStyle();
        $paragraphStyle = $element->getParagraphStyle();

        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextBreak',
            'font_style' => $this->optionalFontStyleToJson($fontStyle),
            'paragraph_style' => $this->optionalParagraphStyleToJson($paragraphStyle),
        ]);
    }

    /**
     * @param TextRun $element
     * @return array
     */
    protected function textRunToJson(TextRun $element)
    {
        $paragraphStyle = $element->getParagraphStyle();

        return array_merge($this->elementToJson($element), [
            'element_type' => 'TextRun',
            'paragraph_style' => $this->optionalParagraphStyleToJson($paragraphStyle),
            'elements' => $this->elementsToJson($element->getElements()),
        ]);
    }

    /**
     * @param Table $element
     * @return array
     */
    protected function tableToJson(Table $element)
    {
        return array_merge($this->elementToJson($element), [
            'element_type' => 'Table',
            'style' => $this->tableStyleToJson($element->getStyle()),
            'rows' => array_map(function ($row) {
                return $this->rowToJson($row);
            }, $element->getRows()),
            'width' => $element->getWidth(),
        ]);
    }

    /**
     * Exports a title. When the title text is an object it is exported as a
     * nested text run and 'text' is left empty; otherwise the plain text is
     * registered in the text index and its hash is attached.
     *
     * @param Title $element
     * @return array
     */
    protected function titleToJson(Title $element)
    {
        $elements = [];
        $text = $element->getText();

        if (is_object($text)) {
            $elements = [$this->textRunToJson($text)];
            $text = '';
        }

        $result = array_merge($this->elementToJson($element), [
            'element_type' => 'Title',
            'depth' => $element->getDepth(),
            'style' => $element->getStyle(),
            'text' => $text,
            'elements' => $elements,
        ]);

        // Compare against the empty string rather than truthiness so that a
        // title reading "0" is still indexed.
        if ((string) $text !== '') {
            $result['hash'] = $this->addText($text);
        }

        return $result;
    }

    /**
     * @param object $row Must provide getHeight(), getStyle() and getCells().
     * @return array
     */
    protected function rowToJson($row)
    {
        return [
            'height' => $row->getHeight(),
            'style' => $this->rowStyleToJson($row->getStyle()),
            'cells' => array_map(function ($cell) {
                return $this->cellToJson($cell);
            }, $row->getCells()),
        ];
    }

    /**
     * @param object $cell Must provide getStyle(), getWidth() and getElements().
     * @return array
     */
    protected function cellToJson($cell)
    {
        return [
            'style' => $this->cellStyleToJson($cell->getStyle()),
            'width' => $cell->getWidth(),
            'elements' => $this->elementsToJson($cell->getElements()),
        ];
    }

    // Styles

    /**
     * Border size/colour/style for the four sides.
     *
     * @param object $style Any style exposing the getBorder<Side><Attr>() getters.
     * @return array
     */
    protected function borderStyleToJson($style)
    {
        return [
            'style' => 'border',

            'BorderTopSize' => $style->getBorderTopSize(),
            'BorderTopColor' => $style->getBorderTopColor(),
            'BorderTopStyle' => $style->getBorderTopStyle(),

            'BorderLeftSize' => $style->getBorderLeftSize(),
            'BorderLeftColor' => $style->getBorderLeftColor(),
            'BorderLeftStyle' => $style->getBorderLeftStyle(),

            'BorderRightSize' => $style->getBorderRightSize(),
            'BorderRightColor' => $style->getBorderRightColor(),
            'BorderRightStyle' => $style->getBorderRightStyle(),

            'BorderBottomSize' => $style->getBorderBottomSize(),
            'BorderBottomColor' => $style->getBorderBottomColor(),
            'BorderBottomStyle' => $style->getBorderBottomStyle(),
        ];
    }

    /**
     * Border fields plus cell-specific ones; 'VAlign' is present only when set.
     *
     * @param Style\Cell $style
     * @return array
     */
    protected function cellStyleToJson(Style\Cell $style)
    {
        $styles = array_merge($this->borderStyleToJson($style), [
            'style' => 'cell',

            'TextDirection' => $style->getTextDirection(),
            'BgColor' => $style->getBgColor(),
            'GridSpan' => $style->getGridSpan(),
            'VMerge' => $style->getVMerge(),
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'Width' => $style->getWidth(),
            'Unit' => $style->getUnit(),
        ]);

        $vAlign = $style->getVAlign();
        if ($vAlign) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * Page layout of a section; 'VAlign' is present only when set.
     *
     * @param Style\Section $style
     * @return array
     */
    protected function sectionStyleToJson(Style\Section $style)
    {
        $styles = [
            'style' => 'section',

            'BreakType' => $style->getBreakType(),
            'ColsNum' => $style->getColsNum(),
            'ColsSpace' => $style->getColsSpace(),
            'FooterHeight' => $style->getFooterHeight(),
            'Gutter' => $style->getGutter(),
            'HeaderHeight' => $style->getHeaderHeight(),
            'LineNumbering' => $style->getLineNumbering(),
            'MarginBottom' => $style->getMarginBottom(),
            'MarginLeft' => $style->getMarginLeft(),
            'MarginRight' => $style->getMarginRight(),
            'MarginTop' => $style->getMarginTop(),
            'Orientation' => $style->getOrientation(),
            'PageNumberingStart' => $style->getPageNumberingStart(),
            'PageSizeH' => $style->getPageSizeH(),
            'PageSizeW' => $style->getPageSizeW(),
            'PaperSize' => $style->getPaperSize(),
        ];

        $vAlign = $style->getVAlign();
        if ($vAlign) {
            $styles['VAlign'] = $vAlign;
        }

        return $styles;
    }

    /**
     * @param object|null $style
     * @return array|null Null when no shading is set.
     */
    protected function shadingStyleToJson($style)
    {
        if (!$style) {
            return null;
        }

        return [
            'style' => 'shading',

            'pattern' => $style->getPattern(),
            'color' => $style->getColor(),
            'fill' => $style->getFill(),
        ];
    }

    /**
     * Image-style fields plus line-specific ones.
     *
     * @param Style\Line $style
     * @return array
     */
    protected function lineStyleToJson(Style\Line $style)
    {
        return array_merge($this->imageStyleToJson($style), [
            'style' => 'line',

            'BeginArrow' => $style->getBeginArrow(),
            'Color' => $style->getColor(),
            'ConnectorType' => $style->getConnectorType(),
            'Dash' => $style->getDash(),
            'EndArrow' => $style->getEndArrow(),
            'Flip' => $style->isFlip(),
            'Weight' => $style->getWeight(),
        ]);
    }

    /**
     * @param Style\ListItem $style
     * @return array
     */
    protected function listItemStyleToJson(Style\ListItem $style)
    {
        return [
            // NOTE(review): the tag reads 'line_item' (possibly meant to be
            // 'list_item'); kept unchanged because consumers may match on it.
            'style' => 'line_item',

            'ListType' => $style->getListType(),
            'NumStyle' => $style->getNumStyle(),
        ];
    }

    /**
     * Font attributes; 'Paragraph' is present only when the font carries a paragraph style.
     *
     * @param Style\Font $style
     * @return array
     */
    protected function fontStyleToJson(Style\Font $style)
    {
        $styles = [
            'style' => 'font',

            'StyleName' => $style->getStyleName(),
            'Name' => $style->getName(),
            'Size' => $style->getSize(),
            'Color' => $style->getColor(),
            'Hint' => $style->getHint(),
            'Bold' => $style->isBold(),
            'Italic' => $style->isItalic(),
            'Underline' => $style->getUnderline(),
            'Strikethrough' => $style->isStrikethrough(),
            'DoubleStrikethrough' => $style->isDoubleStrikethrough(),
            'SuperScript' => $style->isSuperScript(),
            'SubScript' => $style->isSubScript(),
            'SmallCaps' => $style->isSmallCaps(),
            'AllCaps' => $style->isAllCaps(),
            'FgColor' => $style->getFgColor(),
            'Hidden' => $style->isHidden(),
            'Type' => $style->getStyleType(),

            'Scale' => $style->getScale(),
            'Spacing' => $style->getSpacing(),
            'Kerning' => $style->getKerning(),
            'Position' => $style->getPosition(),
        ];

        $paragraph = $style->getParagraph();
        if ($paragraph) {
            $styles['Paragraph'] = $this->paragraphStyleToJson($paragraph);
        }

        return $styles;
    }

    /**
     * @param Style\Frame $style
     * @return array
     */
    protected function frameStyleToJson(Style\Frame $style)
    {
        return [
            'style' => 'frame',

            'Alignment' => $style->getAlignment(),
            'Height' => $style->getHeight(),
            'Left' => $style->getLeft(),
            'HPos' => $style->getHPos(),
            'HPosRelTo' => $style->getHPosRelTo(),
            'Pos' => $style->getPos(),
            'VPos' => $style->getVPos(),
            'VPosRelTo' => $style->getVPosRelTo(),
            'Position' => $style->getPosition(),
            'Top' => $style->getTop(),
            'Unit' => $style->getUnit(),
            'Width' => $style->getWidth(),
            'Wrap' => $style->getWrap(),
            'WrapDistanceBottom' => $style->getWrapDistanceBottom(),
            'WrapDistanceLeft' => $style->getWrapDistanceLeft(),
            'WrapDistanceRight' => $style->getWrapDistanceRight(),
            'WrapDistanceTop' => $style->getWrapDistanceTop(),
        ];
    }

    /**
     * Frame fields plus image-specific positioning.
     *
     * @param Style\Image $style
     * @return array
     */
    protected function imageStyleToJson(Style\Image $style)
    {
        return array_merge($this->frameStyleToJson($style), [
            'style' => 'image',

            'MarginLeft' => $style->getMarginLeft(),
            'MarginTop' => $style->getMarginTop(),
            'WrappingStyle' => $style->getWrappingStyle(),
            'Positioning' => $style->getPositioning(),
            'PosHorizontal' => $style->getPosHorizontal(),
            'PosHorizontalRel' => $style->getPosHorizontalRel(),
            'PosVertical' => $style->getPosVertical(),
            'PosVerticalRel' => $style->getPosVerticalRel(),
        ]);
    }

    /**
     * @param object|null $style
     * @return array|null Null when no indentation is set.
     */
    protected function indentationStyleToJson($style)
    {
        if (!$style) {
            return null;
        }

        return [
            'style' => 'indentation',

            'Left' => $style->getLeft(),
            'Right' => $style->getRight(),
            'FirstLine' => $style->getFirstLine(),
            'Hanging' => $style->getHanging(),
        ];
    }

    /**
     * @param Style\Spacing $style
     * @return array
     */
    protected function spacingStyleToJson(Style\Spacing $style)
    {
        return [
            'style' => 'spacing',

            'Before' => $style->getBefore(),
            'After' => $style->getAfter(),
            'Line' => $style->getLine(),
            'LineRule' => $style->getLineRule(),
        ];
    }

    /**
     * @param Style\Numbering $style
     * @return array
     */
    protected function numberingStyleToJson(Style\Numbering $style)
    {
        return [
            'style' => 'numbering',

            'NumId' => $style->getNumId(),
            'Type' => $style->getType(),
            'StyleName' => $style->getStyleName(),
            'Index' => $style->getIndex(),

            'Levels' => array_map(function ($numberingLevel) {
                return $this->numberingLevelStyleToJson($numberingLevel);
            }, $style->getLevels()),
        ];
    }

    /**
     * @param Style\NumberingLevel $style
     * @return array
     */
    protected function numberingLevelStyleToJson(Style\NumberingLevel $style)
    {
        return [
            // Historic tag key, kept for existing consumers.
            'type' => 'numbering_level',
            // Same tag under 'style', the key every other style exporter uses.
            'style' => 'numbering_level',

            'Level' => $style->getLevel(),
            'Start' => $style->getStart(),
            'Format' => $style->getFormat(),
            'Restart' => $style->getRestart(),
            'PStyle' => $style->getPStyle(),
            'Suffix' => $style->getSuffix(),
            'Text' => $style->getText(),
            'Alignment' => $style->getAlignment(),
            'Left' => $style->getLeft(),
            'Hanging' => $style->getHanging(),
            'TabPos' => $style->getTabPos(),
            'Font' => $style->getFont(),
            'Hint' => $style->getHint(),
        ];
    }

    /**
     * Paragraph attributes. Empty 'Alignment' / 'TextAlignment' values are
     * replaced by 'baseline'; indentation, spacing and shading are exported
     * through their own converters.
     *
     * @param Style\Paragraph $style
     * @return array
     */
    protected function paragraphStyleToJson(Style\Paragraph $style)
    {
        $spacing = $style->getSpacing();

        $styles = [
            'Name' => $style->getStyleName(),
            'BasedOn' => $style->getBasedOn(),
            'Next' => $style->getNext(),
            'Alignment' => $style->getAlignment(),
            'Indentation' => $this->indentationStyleToJson($style->getIndentation()),
            // spacingStyleToJson() is type-hinted, so an unset spacing is passed through as-is.
            'Spacing' => $spacing ? $this->spacingStyleToJson($spacing) : $spacing,
            'WidowControl' => $style->hasWidowControl(),
            'KeepNext' => $style->isKeepNext(),
            'KeepLines' => $style->isKeepLines(),
            'PageBreakBefore' => $style->hasPageBreakBefore(),
            'NumStyle' => $style->getNumStyle(),
            'NumLevel' => $style->getNumLevel(),
            // NOTE(review): tabs are exported as returned by the style object,
            // without conversion - confirm consumers can serialise them.
            'Tabs' => $style->getTabs(),
            // Converted like the cell shading instead of leaking the raw style object.
            'Shading' => $this->shadingStyleToJson($style->getShading()),
            'ContextualSpacing' => $style->hasContextualSpacing(),
            'Bidi' => $style->isBidi(),
            'TextAlignment' => $style->getTextAlignment(),
            'SuppressAutoHyphens' => $style->hasSuppressAutoHyphens(),
        ];

        $styles['style'] = 'paragraph';

        if (!$styles['Alignment']) {
            $styles['Alignment'] = 'baseline';
        }

        if (!$styles['TextAlignment']) {
            $styles['TextAlignment'] = 'baseline';
        }

        return $styles;
    }

    /**
     * Exports a table style.
     *
     * @param object|string|null $style
     * @return array|string Empty array when unset, the style name when a
     *                      string is given, otherwise the exported attributes.
     */
    protected function tableStyleToJson($style)
    {
        if (!$style) {
            return [];
        }

        if (is_string($style)) {
            return $style;
        }

        return array_merge(
            $this->borderStyleToJson($style),
            [
                'style' => 'table',

                'BgColor' => $style->getBgColor(),
                'CellSpacing' => $style->getCellSpacing(),
                // Converted like the cell shading instead of leaking the raw style object.
                'Shading' => $this->shadingStyleToJson($style->getShading()),
                'Alignment' => $style->getAlignment(),
                'Width' => $style->getWidth(),
                'Unit' => $style->getUnit(),
                'Layout' => $style->getLayout(),
                'ColumnWidths' => $style->getColumnWidths(),
                'BidiVisual' => $style->isBidiVisual(),
                'position' => $this->tablePositionStyleToJson($style->getPosition()),
                'first_row' => $this->tableStyleToJson($style->getFirstRow()),
                'BorderInsideHSize' => $style->getBorderInsideHSize(),
                'BorderInsideHColor' => $style->getBorderInsideHColor(),
                'BorderInsideVSize' => $style->getBorderInsideVSize(),
                'BorderInsideVColor' => $style->getBorderInsideVColor(),
                'CellMarginTop' => $style->getCellMarginTop(),
                'CellMarginRight' => $style->getCellMarginRight(),
                'CellMarginLeft' => $style->getCellMarginLeft(),
                'CellMarginBottom' => $style->getCellMarginBottom(),
            ]
        );
    }

    /**
     * @param object|null $style
     * @return array Empty array when no position is set.
     */
    protected function tablePositionStyleToJson($style)
    {
        if (!$style) {
            return [];
        }

        return [
            'style' => 'table_position',

            'LeftFromText' => $style->getLeftFromText(),
            'RightFromText' => $style->getRightFromText(),
            'TopFromText' => $style->getTopFromText(),
            'BottomFromText' => $style->getBottomFromText(),
            'VertAnchor' => $style->getVertAnchor(),
            'HorzAnchor' => $style->getHorzAnchor(),
            'TblpXSpec' => $style->getTblpXSpec(),
            'TblpX' => $style->getTblpX(),
            'TblpYSpec' => $style->getTblpYSpec(),
            'TblpY' => $style->getTblpY(),
        ];
    }

    /**
     * @param object $style Must provide isTblHeader(), isCantSplit() and isExactHeight().
     * @return array
     */
    protected function rowStyleToJson($style)
    {
        return [
            'style' => 'row',

            'TblHeader' => $style->isTblHeader(),
            'CantSplit' => $style->isCantSplit(),
            'ExactHeight' => $style->isExactHeight(),
        ];
    }

    /**
     * Appends $text to the text index and records its range under a new hash.
     *
     * Offsets are byte offsets (strlen) and 'range_end' is inclusive.
     *
     * NOTE(review): an empty text still advances the running offset by one
     * although nothing is appended to 'text', so ranges recorded afterwards
     * are shifted relative to the concatenated string. Kept unchanged -
     * confirm whether consumers rely on it.
     *
     * @param string|null $text Null is treated as an empty string.
     * @return string The hash identifying the new index entry.
     */
    protected function addText($text)
    {
        // Cast once: passing null to strlen() is deprecated since PHP 8.1.
        $text = (string) $text;
        $length = strlen($text);
        $hash = $this->generateHash();

        $this->textContents['text'] .= $text;
        $this->textContents['elements'][] = [
            'hash' => $hash,
            'range_start' => $this->textLength,
            'range_end' => $this->textLength + ($length > 0 ? $length - 1 : 0),
        ];

        $this->textLength += $length > 0 ? $length : 1;

        return $hash;
    }

    /**
     * @return string Identifier produced by uniqid().
     */
    protected function generateHash()
    {
        // NOTE(review): uniqid() is time-based; verify it is unique enough on
        // the target platform before relying on these hashes as keys.
        return uniqid();
    }

    /**
     * Returns the class name of $object without its namespace.
     *
     * @param object $object
     * @return string
     */
    private function shortClassName($object)
    {
        $class = get_class($object);
        $separator = strrpos($class, '\\');

        return $separator === false ? $class : substr($class, $separator + 1);
    }

    /**
     * Generic record for element types that have no dedicated exporter.
     *
     * Children are exported when the element exposes getElements(), so nested
     * text still reaches the text index.
     *
     * @param object $element
     * @param string $className Short class name used as the type tag.
     * @return array
     */
    private function unsupportedElementToJson($element, $className)
    {
        $json = [
            'element_type' => $className,
        ];

        if (method_exists($element, 'getElements')) {
            $json['elements'] = $this->elementsToJson($element->getElements());
        }

        return $json;
    }

    /**
     * Exports a font style that may be unset or given as a style name.
     *
     * @param object|string|null $style
     * @return array|string|null Null when unset, the name when a string is
     *                           given, otherwise the exported attributes.
     */
    private function optionalFontStyleToJson($style)
    {
        if (!$style) {
            return null;
        }

        if (is_string($style)) {
            return $style;
        }

        return $this->fontStyleToJson($style);
    }

    /**
     * Exports a paragraph style that may be unset or given as a style name.
     *
     * @param object|string|null $style
     * @return array|string|null Null when unset, the name when a string is
     *                           given, otherwise the exported attributes.
     */
    private function optionalParagraphStyleToJson($style)
    {
        if (!$style) {
            return null;
        }

        if (is_string($style)) {
            return $style;
        }

        return $this->paragraphStyleToJson($style);
    }
}
|