wiki-grav/plugins/external_links/classes/ExternalLinks.php
2022-05-27 15:47:01 +02:00

506 lines
19 KiB
PHP

<?php
/**
* External Links
*
* This file is part of Grav External Links plugin.
*
* Dual licensed under the MIT or GPL Version 3 licenses, see LICENSE.
* http://benjamin-regler.de/license/
*/
namespace Grav\Plugin;
use Grav\Common\Utils;
use Grav\Common\Grav;
/**
 * External Links
 *
 * Helper class to add small icons to external and mailto links, informing
 * users the link will take them to a new site or open their email client.
 *
 * The class rewrites every <a> element found in a chunk of HTML: external
 * links receive CSS classes, an optional target, rel="noopener noreferrer"
 * (plus "nofollow" when configured) and an optional title; mailto links
 * receive the "mailto" class.
 */
class ExternalLinks
{
    /** -------------
     * Public methods
     * --------------
     */

    /**
     * Process contents, i.e. apply the external links filter to the content.
     *
     * @param string             $content The content to render.
     * @param array|object       $options Options to be passed to the renderer.
     *                                    Either an object exposing
     *                                    get($key, $default) (presumably Grav's
     *                                    config data object -- confirm against
     *                                    the caller) or a (nested) array.
     * @param null|Page          $page    Null or an instance of \Grav\Common\Page.
     *
     * @return string            The rendered contents. If the regular
     *                           expression engine fails, the content is
     *                           returned unchanged.
     */
    public function render($content, $options = [], $page = null)
    {
        // Get all <a> tags and process them one by one
        $rendered = preg_replace_callback(
            '~<a(?:\s[^>]*)?>.*?</a>~i',
            function ($match) use ($options, $page) {
                return $this->processLink($match[0], $options, $page);
            },
            $content
        );

        // preg_replace_callback() returns null on PCRE errors (e.g. when the
        // backtrack limit is hit); never replace the page content with null.
        return ($rendered === null) ? $content : $rendered;
    }

    /** -------------------------------
     * Private/protected helper methods
     * --------------------------------
     */

    /**
     * Process a single <a>...</a> snippet.
     *
     * @param string       $html    The HTML of one link element.
     * @param array|object $options Renderer options (see render()).
     * @param null|Page    $page    Null or an instance of \Grav\Common\Page.
     *
     * @return string      The rewritten link, or $html unchanged when the
     *                     link must not (or cannot) be processed.
     */
    private function processLink($html, $options, $page = null)
    {
        // Load PHP built-in DOMDocument class
        if (($dom = $this->loadDOMDocument($html)) === null) {
            return $html;
        }

        // Check that there is really a link tag
        $links = $dom->getElementsByTagName('a');
        if ($links->length == 0) {
            return $html;
        }
        $a = $links->item(0);

        // Process links with non-empty href attribute only
        $href = $a->getAttribute('href');
        if (strlen($href) == 0) {
            return $html;
        }

        // Get the classes of the <a> element
        $classes = $this->splitTokens($a->getAttribute('class'));

        // Exclude links with specific classes from processing; the cast
        // tolerates a single class configured as a plain string.
        $exclude = (array) $this->getOption($options, 'exclude.classes', []);
        if (array_intersect($exclude, $classes)) {
            return $html;
        }

        // Get domains to be seen as internal
        $domains = (array) $this->getOption($options, 'exclude.domains', []);

        if (stripos($href, 'mailto:') === 0) {
            // This is a mailto link (URI schemes are case-insensitive)
            $classes[] = 'mailto';
        } elseif ($url = $this->isExternalUrl($href, $domains, $page)) {
            // The link is external
            $classes[] = 'external-link';
            $a->setAttribute('href', $url);

            // Add target (e.g. "_blank")
            $target = $this->getOption($options, 'target');
            if ($target) {
                $a->setAttribute('target', $target);
            }

            // Set rel="[nofollow] noopener noreferrer", keeping existing values
            $rel = $this->splitTokens($a->getAttribute('rel'));
            if ($this->getOption($options, 'no_follow')) {
                $rel[] = 'nofollow';
            }
            $rel[] = 'noopener';
            $rel[] = 'noreferrer';
            $a->setAttribute('rel', implode(' ', array_unique($rel)));

            // Add a class describing the image content of the link
            $classes[] = $this->getImageClass($a);

            // Add title (aka alert text)
            if ($this->getOption($options, 'title')) {
                $language = Grav::instance()['language'];
                $message = $language->translate(['PLUGINS.EXTERNAL_LINKS.TITLE_MESSAGE']);

                // Set default title to link else, set title as data attribute
                $key = $a->hasAttribute('title') ? 'data-title' : 'title';
                $a->setAttribute($key, $message);
            }
        }

        // Set class attribute
        if (count($classes) && ($this->getOption($options, 'mode') === 'active')) {
            $a->setAttribute('class', implode(' ', $classes));
        }

        // Save DOM document back to HTML representation; keep the original
        // markup if serialization produced nothing.
        $processed = $this->saveDOMDocument($dom);

        return (is_string($processed) && $processed !== '') ? $processed : $html;
    }

    /**
     * Read an option from either an object exposing get() or a nested array
     * (dot notation, e.g. "exclude.classes").
     *
     * @param array|object $options The options container.
     * @param string       $key     The option name.
     * @param mixed        $default Value returned when the option is not set.
     *
     * @return mixed       The option value or $default.
     */
    private function getOption($options, $key, $default = null)
    {
        if (is_object($options) && is_callable([$options, 'get'])) {
            return $options->get($key, $default);
        }

        if (!is_array($options)) {
            return $default;
        }

        // Walk the nested array along the dot-separated path
        $value = $options;
        foreach (explode('.', $key) as $segment) {
            if (!is_array($value) || !array_key_exists($segment, $value)) {
                return $default;
            }
            $value = $value[$segment];
        }

        return ($value === null) ? $default : $value;
    }

    /**
     * Split a whitespace separated attribute value (class, rel) into tokens.
     *
     * @param string $value The raw attribute value.
     *
     * @return array The non-empty tokens.
     */
    private function splitTokens($value)
    {
        $tokens = preg_split('~\s+~', (string) $value, -1, PREG_SPLIT_NO_EMPTY);

        return is_array($tokens) ? $tokens : [];
    }

    /**
     * Determine the CSS class describing the images inside a link.
     *
     * @param DOMElement $a The link element.
     *
     * @return string "images" for several child images, "no-image" for
     *                none, "icon" for one image whose largest dimension is
     *                between 1 and 32 pixels, "image" otherwise.
     */
    private function getImageClass($a)
    {
        $imgs = $a->getElementsByTagName('img');
        if ($imgs->length > 1) {
            return 'images';
        }
        if ($imgs->length == 0) {
            return 'no-image';
        }

        // Get image size; tolerate non-array results of overridden methods
        $dimensions = (array) $this->getImageSize($imgs->item(0));
        $width = isset($dimensions[0]) ? (float) $dimensions[0] : 0.0;
        $height = isset($dimensions[1]) ? (float) $dimensions[1] : 0.0;

        // Depending on the maximum dimension determine the image type
        $size = max($width, $height);

        return ((0 < $size) && ($size <= 32)) ? 'icon' : 'image';
    }

    /**
     * Test if a URL is external
     *
     * @param string    $url     The URL to test.
     * @param array     $domains An array of domains to be seen as internal.
     *                           An asterisk acts as a wildcard; a trailing
     *                           "/*" also matches the bare domain.
     * @param null|Page $page    Null or an instance of \Grav\Common\Page.
     *
     * @return mixed    Returns the URL as a string, if it is external,
     *                  false otherwise.
     */
    protected function isExternalUrl($url, $domains = [], $page = null)
    {
        static $allowed_protocols;
        static $patterns = [];

        $grav = Grav::instance();

        /** @var Config $config */
        $config = $grav['config'];

        /** @var Page $page */
        $page = $page ?: $grav['page'];

        // Statically store allowed protocols
        if (!isset($allowed_protocols)) {
            $schemes = (array) $config->get('plugins.external_links.links.schemes', ['http', 'https']);
            $allowed_protocols = array_flip(array_map('strtolower', $schemes));
        }

        // The site itself is always internal. Empty entries are dropped,
        // because an empty alternative would match every URL.
        $internal = array_merge((array) $domains, [$grav['base_url_absolute']]);
        $internal = array_values(array_filter(array_map('strval', $internal), 'strlen'));

        // Statically store internal domains as PCRE patterns, one per
        // distinct domain list, so a cached pattern can never go stale.
        $cacheKey = implode("\n", $internal);
        if (!isset($patterns[$cacheKey])) {
            $quoted = [];
            foreach ($internal as $domain) {
                // Quote first, then translate the (now escaped) wildcards
                $quoted[] = str_replace(['/\*', '\*'], '.*?', preg_quote($domain, '#'));
            }
            $patterns[$cacheKey] = $quoted ? '#(' . implode('|', $quoted) . ')#i' : false;
        }
        $pattern = $patterns[$cacheKey];

        // URLs matching any excluded domain are internal
        if (($pattern !== false) && preg_match($pattern, $url)) {
            return false;
        }

        // Check if URL is external by extracting colon position
        $colonpos = strpos($url, ':');
        if (($colonpos !== false) && ($colonpos > 0)) {
            // We found a colon, possibly a protocol. Verify.
            $protocol = strtolower(substr($url, 0, $colonpos));

            return isset($allowed_protocols[$protocol]) ? $url : false;
        }

        $external = false;

        if ($config->get('plugins.external_links.links.www')) {
            // Remove possible path duplicate
            $route = $grav['base_url'] . ($page ? $page->route() : '');
            $href = (($route !== '') && Utils::startsWith($url, $route))
                ? ltrim(mb_substr($url, mb_strlen($route)), '/')
                : $url;

            // We found an url without protocol, but with starting 'www' (sub-)domain
            if (Utils::startsWith($url, 'www.')) {
                $external = 'http://' . $url;
            } elseif (Utils::startsWith($href, 'www.')) {
                $external = 'http://' . $href;
            }
        }

        if ($config->get('plugins.external_links.links.redirects')) {
            $targetPage = $grav['pages']->find($url);
            $redirect = $targetPage ? $targetPage->redirect() : null;

            // Follow the redirect of the target page; a page redirecting to
            // itself would otherwise recurse forever.
            if ($redirect && ($redirect !== $url)) {
                $external = $this->isExternalUrl($redirect, $domains, $page);
            }
        }

        // Only if a valid protocol or an URL starting with 'www.' was found
        // the URL is returned
        return $external;
    }

    /**
     * Determine the size of an image
     *
     * Precedence: "width"/"height" declarations of the style attribute, then
     * the width/height attributes, then the dimensions read from the image
     * file. The file is only read (or downloaded) when at least one
     * dimension is not given explicitly.
     *
     * @param DOMNode $imgNode The image already parsed as a DOMNode
     * @param integer $limit   Load first $limit KB of remote image
     *
     * @return array  Return the dimension of the image of the
     *                format array(width, height); further elements of
     *                getimagesize() are present when the file was read.
     */
    protected function getImageSize($imgNode, $limit = 32)
    {
        // Hold units (assume standard font with 16px base pixel size)
        // Calculations based on pixels. Percentages are relative to the
        // container and therefore cannot be converted.
        $units = [
            'px'  => 1,           /* base unit: pixel */
            'pt'  => 16 / 12,     /* 12 point = 16 pixel */
            'pc'  => 16,          /* 1 pica = 16 pixel = 12 points */
            'in'  => 96,          /* 1 inch = 96 pixel = 2.54 centimeters */
            'mm'  => 96 / 25.4,   /* 1 millimeter = 96 pixel / 1 inch [mm] */
            'cm'  => 96 / 2.54,   /* 1 centimeter = 96 pixel / 1 inch [cm] */
            'm'   => 96 / 0.0254, /* 1 meter = 96 pixel / 1 inch [m] */
            'ex'  => 7,           /* 1 ex = 7 pixel */
            'em'  => 16,          /* 1 em = 16 pixel */
            'rem' => 16,          /* 1 rem = 16 pixel */
        ];

        // Dimensions given explicitly in the markup (null = unknown)
        $explicit = ['width' => null, 'height' => null];

        // Read out width and height from <img> attributes; values without
        // a leading number (e.g. "auto") are treated as unknown
        foreach (array_keys($explicit) as $property) {
            if ($imgNode->hasAttribute($property)
                && preg_match('~^\s*(\d+(?:\.\d+)?|\.\d+)~', $imgNode->getAttribute($property), $matches)
            ) {
                $explicit[$property] = (float) $matches[1];
            }
        }

        // Get width and height from style attribute
        if ($imgNode->hasAttribute('style')) {
            $style = $imgNode->getAttribute('style');

            foreach (array_keys($explicit) as $property) {
                // The look-behind skips max-width, border-width, line-height, ...
                $pattern = '~(?<![\w-])' . $property . '\s*:\s*(\d+(?:\.\d+)?|\.\d+)([a-z]+)~i';
                if (preg_match($pattern, $style, $matches)) {
                    // Convert unit to pixel; unknown units are kept as is
                    $unit = strtolower($matches[2]);
                    $factor = isset($units[$unit]) ? $units[$unit] : 1;
                    $explicit[$property] = (float) $matches[1] * $factor;
                }
            }
        }

        // Determine image dimensions based on "src" attribute, but only if
        // the markup does not already provide both of them
        $size = [0, 0];
        $incomplete = ($explicit['width'] === null) || ($explicit['height'] === null);
        if ($incomplete && $imgNode->hasAttribute('src')) {
            $src = $imgNode->getAttribute('src');
            if ($src !== '') {
                $size = $this->probeImageFile($src, $limit);
            }
        }

        // Update width and height
        $size[0] = ($explicit['width'] !== null)
            ? $explicit['width']
            : (isset($size[0]) ? $size[0] : 0);
        $size[1] = ($explicit['height'] !== null)
            ? $explicit['height']
            : (isset($size[1]) ? $size[1] : 0);

        // Return image dimensions
        return $size;
    }

    /**
     * Read the dimensions of an image from disk or, failing that, remotely.
     *
     * @param string  $src   The value of the "src" attribute.
     * @param integer $limit Load first $limit KB of a remote image.
     *
     * @return array  The result of getimagesize() or array(0, 0) on failure.
     */
    private function probeImageFile($src, $limit)
    {
        // Simple check if the URL is internal i.e. check if path exists
        $root = isset($_SERVER['DOCUMENT_ROOT']) ? $_SERVER['DOCUMENT_ROOT'] : '';
        $path = $root . $src;

        if (realpath($path) && is_file($path)) {
            $size = @getimagesize($path);
        } else {
            // The URL is external; try to load it (default: 32 KB)
            $size = $this->getRemoteImageSize($src, $limit * 1024);
        }

        return is_array($size) ? $size : [0, 0];
    }

    /**
     * Get the size of a remote image
     *
     * Only "http", "https" and "data" URIs (and URIs without a scheme) are
     * loaded; other stream wrappers such as "phar://" or "php://" are
     * rejected, because the URI originates from page content.
     *
     * @param string  $uri   The URI of the remote image
     * @param integer $limit Load first $limit bytes of remote image
     *
     * @return array  Returns an array with up to 7 elements as produced by
     *                getimagesize(), or array(0, 0, 'width="0" height="0"')
     *                if the image could not be loaded or recognized.
     */
    protected function getRemoteImageSize($uri, $limit = -1)
    {
        $empty = [0, 0, 'width="0" height="0"'];
        $limit = (int) $limit;

        // Reject URIs with a scheme other than the allowed ones
        if (preg_match('~^([a-z][a-z0-9+.\-]*):~i', $uri, $scheme)
            && !in_array(strtolower($scheme[1]), ['http', 'https', 'data'], true)
        ) {
            return $empty;
        }

        // Create temporary file to store data from $uri
        $tmp_name = tempnam(sys_get_temp_dir(), uniqid('ris'));
        if ($tmp_name === false) {
            return $empty;
        }

        // Open temporary file
        $tmp = @fopen($tmp_name, 'wb');
        if ($tmp === false) {
            @unlink($tmp_name);
            return $empty;
        }

        // Check which method we should use to get remote image sizes
        if (ini_get('allow_url_fopen')) {
            // Use stream copy
            $options = [];
            if ($limit > 0) {
                // Ask the server for the first bytes only
                $options['http']['header'] = ['Range: bytes=0-' . $limit];
            }

            // Create stream context
            $context = stream_context_create($options);
            $source = @fopen($uri, 'rb', false, $context);
            if ($source !== false) {
                if ($limit > 0) {
                    // Enforce the limit even if the server ignores "Range"
                    stream_copy_to_stream($source, $tmp, $limit + 1);
                } else {
                    stream_copy_to_stream($source, $tmp);
                }
                fclose($source);
            }
        } elseif (function_exists('curl_version')) {
            // Use Curl
            $curl = curl_init();
            curl_setopt_array($curl, [
                CURLOPT_HEADER         => false,    // Don't return headers
                CURLOPT_FOLLOWLOCATION => true,     // Follow redirects
                CURLOPT_AUTOREFERER    => true,     // Set referrer on redirect
                CURLOPT_CONNECTTIMEOUT => 120,      // Timeout on connect
                CURLOPT_TIMEOUT        => 120,      // Timeout on response
                CURLOPT_MAXREDIRS      => 10,       // Stop after 10 redirects
                CURLOPT_ENCODING       => '',       // Handle all encodings
                CURLOPT_FILE           => $tmp,     // Curl file
                CURLOPT_URL            => $uri,     // URI
                // Never leave HTTP(S), not even through a redirect
                CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
                CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            ]);

            if ($limit > 0) {
                // Loading number of $limit bytes
                curl_setopt($curl, CURLOPT_RANGE, '0-' . $limit);

                // Abort request when more data is received
                curl_setopt($curl, CURLOPT_BUFFERSIZE, 512);    // More progress info
                curl_setopt($curl, CURLOPT_NOPROGRESS, false);  // Monitor progress
                curl_setopt(
                    $curl,
                    CURLOPT_PROGRESSFUNCTION,
                    // The callback receives the cURL handle as first argument
                    function ($resource, $download_size, $downloaded, $upload_size, $uploaded) use ($limit) {
                        // The range 0-$limit covers $limit + 1 bytes; when
                        // more is received, returning non-zero breaks the
                        // connection!
                        return ($downloaded > $limit + 1) ? 1 : 0;
                    }
                );
            }

            // Execute Curl
            curl_exec($curl);
            curl_close($curl);
        }

        // Close temporary file
        fclose($tmp);

        // Retrieve image information; make sure filesize() is not served
        // from the stat cache
        clearstatcache(true, $tmp_name);
        $info = $empty;
        if (filesize($tmp_name) > 0) {
            $detected = @getimagesize($tmp_name);
            if (is_array($detected)) {
                $info = $detected;
            }
        }

        // Delete temporary file
        @unlink($tmp_name);

        return $info;
    }

    /**
     * Load contents into PHP built-in DOMDocument object
     *
     * Two really good resources to handle DOMDocument with HTML(5)
     * correctly.
     *
     * @see http://stackoverflow.com/questions/3577641/how-do-you-parse-and-process-html-xml-in-php
     * @see http://stackoverflow.com/questions/7997936/how-do-you-format-dom-structures-in-php
     *
     * @param  string      $content The content to be loaded into the
     *                              DOMDocument object
     *
     * @return DOMDocument|null     DOMDocument object of content or null,
     *                              if the content is empty or not parsable
     */
    protected function loadDOMDocument($content)
    {
        $content = (string) $content;

        // Do nothing, if there is nothing to parse
        if (trim($content) === '') {
            return null;
        }

        // Collect parser errors internally instead of emitting warnings for
        // invalid HTML; remember the previous setting to restore it later
        $previous = libxml_use_internal_errors(true);

        // Parse content using PHP built-in DOMDocument class
        $document = new \DOMDocument('1.0', 'UTF-8');

        // Encode all non-ASCII characters as numeric entities, so the HTML
        // parser does not have to guess the encoding of the UTF-8 content
        $content = mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

        $loaded = $document->loadHTML($content);

        // Drop collected errors and restore the previous error handling
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // Do nothing, if DOM is empty
        if (!$loaded || is_null($document->documentElement)) {
            return null;
        }

        return $document;
    }

    /**
     * Save contents of PHP built-in DOMDocument object as HTML5
     *
     * @param  DOMDocument $document DOMDocument object with nodes
     *
     * @return string                The children of the <body> element as
     *                               HTML(5) compliant string; an empty
     *                               string, if the document has no body
     */
    protected function saveDOMDocument($document)
    {
        // Pretty print output
        $document->preserveWhiteSpace = false;
        $document->formatOutput = true;

        $body = $document->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return '';
        }

        // Transform DOM document to valid HTML(5)
        $content = '';
        foreach ($body->childNodes as $node) {
            // Expand empty tags (e.g. <br/> to <br></br>)
            $html = $document->saveXML($node, LIBXML_NOEMPTYTAG);
            if ($html !== false) {
                $content .= $html;
            }
        }

        // Fix formatting for self-closing tags in HTML5 and removing
        // encapsulated (uncommented) CDATA blocks in <script> and
        // <style> tags
        $regex = [
            '~' . preg_quote('<![CDATA[', '~') . '~' => '',
            '~' . preg_quote(']]>', '~') . '~' => '',
            '~></(?:area|base(?:font)?|br|col|command|embed|frame|hr|img|input|keygen|link|meta|param|source|track|wbr)>~' => ' />',
        ];

        // Make XML HTML5 compliant; fall back to the raw XML on PCRE errors
        $result = preg_replace(array_keys($regex), array_values($regex), $content);

        return ($result === null) ? $content : $result;
    }
}