diff --git a/composer.json b/composer.json index 76df3e9..fb662c1 100644 --- a/composer.json +++ b/composer.json @@ -26,6 +26,7 @@ "ext-libxml": "*" }, "require-dev": { - "phpunit/phpunit": "^7.0|^8.0|^9.0" + "phpunit/phpunit": "^7.0|^8.0|^9.0", + "phpstan/phpstan": "^1.9" } } diff --git a/phpstan.neon.dist b/phpstan.neon.dist new file mode 100644 index 0000000..b05dc95 --- /dev/null +++ b/phpstan.neon.dist @@ -0,0 +1,7 @@ +parameters: + level: 6 + errorFormat: raw + editorUrl: '%%file%% %%line%% %%column%%: %%error%%' + paths: + - src + - tests diff --git a/src/Html2Text.php b/src/Html2Text.php index 30d3a00..1763cb4 100644 --- a/src/Html2Text.php +++ b/src/Html2Text.php @@ -4,6 +4,7 @@ class Html2Text { + /** @return array */ public static function defaultOptions(): array { return [ 'ignore_errors' => false, @@ -23,7 +24,7 @@ public static function defaultOptions(): array { * * * @param string $html the input HTML - * @param boolean|array $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto'] + * @param boolean|array $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto'] * @return string the HTML converted, as best as possible, to text * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument} */ @@ -43,26 +44,26 @@ public static function convert(string $html, $options = []): string { } } - $is_office_document = static::isOfficeDocument($html); + $is_office_document = self::isOfficeDocument($html); if ($is_office_document) { // remove office namespace $html = str_replace(["", ""], "", $html); } - $html = static::fixNewlines($html); + $html = self::fixNewlines($html); // use mb_convert_encoding for legacy versions of php if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) { $html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8"); } - $doc = static::getDocument($html, $options); + $doc = self::getDocument($html, $options); - $output = static::iterateOverNode($doc, null, false, $is_office_document, $options); + $output = self::iterateOverNode($doc, null, false, $is_office_document, $options); // process output for whitespace/newlines - $output = static::processWhitespaceNewlines($output); + $output = self::processWhitespaceNewlines($output); return $output; } @@ -84,6 +85,7 @@ public static function fixNewlines(string $text): string { return $text; } + /** @return array */ public static function nbspCodes(): array { return [ "\xc2\xa0", @@ -91,6 +93,7 @@ public static function nbspCodes(): array { ]; } + /** @return array */ public static function zwnjCodes(): array { return [ "\xe2\x80\x8c", @@ -118,7 +121,7 @@ public static function processWhitespaceNewlines(string $text): string { // convert non-breaking spaces to regular spaces to prevent output issues, // do it here so they do NOT get removed with other leading spaces, as they // are sometimes used for indentation - $text = static::renderText($text); + $text = self::renderText($text); // remove trailing whitespace $text = rtrim($text); @@ -127,7 +130,7 @@ public static function processWhitespaceNewlines(string $text): string { $text = preg_replace("/[ \t]*\n/im", "\n", $text); // unarmor pre blocks - $text = static::fixNewLines($text); + $text = self::fixNewLines($text); // remove unnecessary empty lines $text = preg_replace("/\n\n\n*/im", "\n\n", $text); @@ -143,13 +146,14 @@ public static function isOfficeDocument(string $html): bool { } public static function isWhitespace(string $text): bool { - return strlen(trim(static::renderText($text), "\n\r\t ")) === 0; + return strlen(trim(self::renderText($text), "\n\r\t ")) === 0; } /** * Parse HTML into a DOMDocument * * @param string $html the input HTML + * @param array $options * @return \DOMDocument the parsed document tree */ private static function getDocument(string $html, array $options): \DOMDocument { @@ -218,17 +222,17 @@ private static function getDocument(string $html, array $options): \DOMDocument * by a browser. */ private static function renderText(string $text): string { - $text = str_replace(static::nbspCodes(), " ", $text); - $text = str_replace(static::zwnjCodes(), "", $text); + $text = str_replace(self::nbspCodes(), " ", $text); + $text = str_replace(self::zwnjCodes(), "", $text); return $text; } - private static function nextChildName($node): ?string { + private static function nextChildName(?\DOMNode $node): ?string { // get the next child $nextNode = $node->nextSibling; while ($nextNode != null) { if ($nextNode instanceof \DOMText) { - if (!static::isWhitespace($nextNode->wholeText)) { + if (!self::isWhitespace($nextNode->wholeText)) { break; } } @@ -248,11 +252,12 @@ private static function nextChildName($node): ?string { return $nextName; } - private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is_office_document, $options): string { + /** @param array $options */ + private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string { if ($node instanceof \DOMText) { // Replace whitespace characters with a space (equivilant to \s) if ($in_pre) { - $text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n"; + $text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n"; // Remove trailing whitespace only $text = preg_replace("/[ \t]*\n/im", "\n", $text); @@ -261,10 +266,10 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is return str_replace("\n", "\r", $text); } - $text = static::renderText($node->wholeText); + $text = self::renderText($node->wholeText); $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text); - if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) { + if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) { return "\n" . $text; } return $text; @@ -276,7 +281,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is } $name = strtolower($node->nodeName); - $nextName = static::nextChildName($node); + $nextName = self::nextChildName($node); // start whitespace switch ($name) { @@ -321,6 +326,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is // To fix this, for any p element with a className of `MsoNormal` (the standard // classname in any Microsoft export or outlook for a paragraph that behaves // like a line return) we skip the first line returns and set the name to br. + // @phpstan-ignore-next-line if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') { $output = ""; $name = 'br'; @@ -368,12 +374,12 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is while ($n != null) { - $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options); + $text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options); // Pass current node name to next child, as previousSibling does not appear to get populated if ($n instanceof \DOMDocumentType || $n instanceof \DOMProcessingInstruction - || ($n instanceof \DOMText && static::isWhitespace($text))) { + || ($n instanceof \DOMText && self::isWhitespace($text))) { // Keep current previousSiblingName, these are invisible $trailing_whitespace++; } @@ -430,6 +436,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is case "a": // links are returned in [text](link) format + // @phpstan-ignore-next-line $href = $node->getAttribute("href"); $output = trim($output); @@ -439,18 +446,23 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is $output = substr($output, 1, strlen($output) - 2); // for linking images, the title of the overrides the title of the + // @phpstan-ignore-next-line if ($node->getAttribute("title")) { + // @phpstan-ignore-next-line $output = $node->getAttribute("title"); } } // if there is no link text, but a title attr + // @phpstan-ignore-next-line if (!$output && $node->getAttribute("title")) { + // @phpstan-ignore-next-line $output = $node->getAttribute("title"); } if ($href == null) { // it doesn't link anywhere + // @phpstan-ignore-next-line if ($node->getAttribute("name") != null) { if ($options['drop_links']) { $output = "$output"; @@ -486,9 +498,13 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is break; case "img": + // @phpstan-ignore-next-line if ($node->getAttribute("title")) { + // @phpstan-ignore-next-line $output = "[" . $node->getAttribute("title") . "]"; + // @phpstan-ignore-next-line } elseif ($node->getAttribute("alt")) { + // @phpstan-ignore-next-line $output = "[" . $node->getAttribute("alt") . "]"; } else { $output = ""; @@ -501,7 +517,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is case "blockquote": // process quoted text for whitespace/newlines - $output = static::processWhitespaceNewlines($output); + $output = self::processWhitespaceNewlines($output); // add leading newline $output = "\n" . $output; diff --git a/src/Html2TextException.php b/src/Html2TextException.php index 9171d86..fe919f3 100644 --- a/src/Html2TextException.php +++ b/src/Html2TextException.php @@ -4,9 +4,10 @@ class Html2TextException extends \Exception { - var $more_info; + /** @var string $more_info */ + public $more_info; - public function __construct($message = "", $more_info = "") { + public function __construct(string $message = "", string $more_info = "") { parent::__construct($message); $this->more_info = $more_info; } diff --git a/tests/Html2TextTest.php b/tests/Html2TextTest.php index 9655aff..5e2e522 100644 --- a/tests/Html2TextTest.php +++ b/tests/Html2TextTest.php @@ -16,11 +16,12 @@ public static function setUpBeforeClass(): void { /** * @dataProvider providerFiles */ - public function testFile(string $test) { + public function testFile(string $test): void { $this->doTestWithResults($test, $test, []); } - function doTestWithResults(string $test, string $result, $options = []) { + /** @param bool | array $options */ + function doTestWithResults(string $test, string $result, $options = []): void { $html = __DIR__ . "/html/$test.html"; $txt = __DIR__ . "/txt/$result.txt"; $this->assertTrue(file_exists($html), "File '{$html}' does not exist"); @@ -36,7 +37,8 @@ function doTestWithResults(string $test, string $result, $options = []) { $this->assertEquals($expected, $output, "{$html} file failed to convert to {$txt}"); } - public function providerFiles() { + /** @return array> */ + public function providerFiles(): array { return [ ['basic'], ['anchors'], @@ -62,34 +64,34 @@ public function providerFiles() { ]; } - public function testInvalidXML() { + public function testInvalidXML(): void { $this->expectWarning(); $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => false]); } - public function testInvalidXMLIgnore() { + public function testInvalidXMLIgnore(): void { $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => true]); } - public function testInvalidXMLIgnoreOldSyntax() { + public function testInvalidXMLIgnoreOldSyntax(): void { // for BC, allow old #convert(text, bool) syntax $this->doTestWithResults("invalid", "invalid", true); } - public function testInvalidOption() { + public function testInvalidOption(): void { $this->expectException(InvalidArgumentException::class); $this->doTestWithResults("basic", "basic", ['invalid_option' => true]); } - public function testBasicDropLinks() { + public function testBasicDropLinks(): void { $this->doTestWithResults("basic", "basic.no-links", ['drop_links' => true]); } - public function testAnchorsDropLinks() { + public function testAnchorsDropLinks(): void { $this->doTestWithResults("anchors", "anchors.no-links", ['drop_links' => true]); } - public function testWindows1252() { + public function testWindows1252(): void { $this->doTestWithResults("windows-1252-example", "windows-1252-example", ['char_set' => 'windows-1252']); } } \ No newline at end of file