PHP Classes

File: App/ImgCountHandler.php

Recommend this page to a friend!
  Classes of Igor Dyshlenko   PHP Image Crawler   App/ImgCountHandler.php   Download  
File: App/ImgCountHandler.php
Role: Class source
Content type: text/plain
Description: Class source
Class: PHP Image Crawler
Crawl Web site pages to find images in the pages
Author: By Igor Dyshlenko
Last change:
Date: 4 years ago
Size: 6,512 bytes
 

Contents

Class file image Download
<?php

namespace App;

use
Domain\ImgCountReport;
use
Domain\Page;
use
Domain\Report;
use
Domain\Site;
use
Infrastructure\Repository\PageRepository;
use
InvalidArgumentException;

/**
 * Class ImgCountHandler.
 * Recursive command that walks the pages of a site and counts the <img> tags found on each page.
 *
 * Pages are kept in a PageRepository; the result of a run is exposed as an ImgCountReport
 * built from that repository.
 *
 * @package App
 */
class ImgCountHandler
{
    /** @var Page $rootPage Page created from the corrected root URL passed to the constructor. */
    protected $rootPage;

    /** @var PageRepository $repository Storage for every page encountered while crawling. */
    protected $repository;

    /** @var Site $site Site information; used to filter and correct URLs. */
    protected $site;

    /** @var int $maxDepth The maximum depth of recursion when processing site pages. */
    protected $maxDepth;

    /** @var ContentLoaderInterface Loader used to fetch the content of a batch of URLs. */
    private $contentLoader;

    /**
     * ImgCountHandler constructor.
     *
     * Creates an empty repository, stores the root page in it and passes the headers to the loader.
     *
     * @param Site $site Site information.
     * @param string $rootUrl Root URL for begin processing.
     * @param ContentLoaderInterface $loader Content loader.
     * @param array $headers CURL headers for content load.
     * @param int $maxDepth The maximum depth of recursion when processing site pages.
     */
    public function __construct(Site $site, string $rootUrl, ContentLoaderInterface $loader, array $headers = [],
                                int $maxDepth = PHP_INT_MAX)
    {
        $this->repository = new PageRepository();
        $this->repository->store($this->rootPage = new Page($site->correctUrl($rootUrl)));

        $this->maxDepth = $maxDepth;
        $this->site = $site;
        $this->contentLoader = $loader;
        $loader->setHeaders($headers);
    }

    /**
     * Processes the given URL (and, recursively, the pages it links to) and builds the report.
     *
     * @param string $url URL to start processing from.
     *
     * @return Report Report built from the repository filled during processing.
     */
    public function handle(string $url): Report
    {
        $this->pageProcessingRecursive([$url]);

        return new ImgCountReport($this->repository);
    }

    /**
     * Counts occurrences of "<img" that are followed by whitespace or by the end of the content.
     *
     * @param string $content Page content (taken by reference, not modified here).
     *
     * @return int Number of matches.
     */
    private function countImgTags(string &$content): int
    {
        preg_match_all('/<img(?>\\s|$)/i', $content, $matches);

        return count($matches[0] ?? []);
    }

    /**
     * Fills a page with its children and its image count.
     *
     * When the URL filter yields null the page is recorded with no children and zero images,
     * an error message is printed and processing of this page stops.
     *
     * @param Page $page Page to fill.
     * @param string $content Content of the page.
     */
    private function pageProcessing(Page $page, string &$content): void
    {
        $childrenUrls = $this->correctUrls(UrlFilter::getInstance()->handle($content));

        if ($childrenUrls === null) {
            $page->setChildren([])->setImgCount(0);
            $this->echoErrorMsg($page);

            // Stop here: continuing would iterate over null and overwrite the values set above.
            return;
        }

        $children = [];

        /** @var string $url */
        foreach ($childrenUrls as $url) {
            // Reuse the page already known to the repository so each URL maps to one Page object.
            $childPage = $this->repository->get($url) ?? new Page($url);
            $this->repository->store($childPage);
            $children[] = $childPage;
        }

        $page->setChildren($children)
             ->setImgCount($this->countImgTags($content));
    }

    /**
     * Prints a message describing the last PCRE error for the given page.
     *
     * @param Page $page Page whose content could not be parsed.
     */
    private function echoErrorMsg(Page $page): void
    {
        $messages = [
            PREG_NO_ERROR              => 'No errors.',
            PREG_INTERNAL_ERROR        => 'Internal PCRE error.',
            PREG_BACKTRACK_LIMIT_ERROR => 'Backtrack limit was exhausted.',
            PREG_RECURSION_LIMIT_ERROR => 'Recursion limit was exhausted.',
            PREG_BAD_UTF8_ERROR        => 'Malformed UTF-8 data (only when running a regex in UTF-8 mode).',
            PREG_BAD_UTF8_OFFSET_ERROR =>
                'The offset did not correspond to the beginning of a valid UTF-8 code point '
                . '(only when running a regex in UTF-8 mode).',
            PREG_JIT_STACKLIMIT_ERROR  => 'The last PCRE function failed due to limited JIT stack space.',
        ];

        $errorMsg = $messages[preg_last_error()] ?? 'Unknown PCRE error.';

        echo "\nContent parsing error for URL \"", $page->getUrl(), '": ', $errorMsg, "\n";
    }

    /**
     * Filters and corrects a list of URLs.
     *
     * Keeps only URLs that the site reports as its own, that can be corrected without an
     * InvalidArgumentException, and whose page is not processed yet. The result is re-indexed from 0.
     *
     * @param array|null $urlList Raw URLs, or null.
     *
     * @return array|null Corrected URLs, or null when null was given.
     */
    private function correctUrls(?array $urlList): ?array
    {
        if ($urlList === null) {
            return null;
        }

        $accepted = [];

        foreach ($urlList as $url) {
            if (!$this->site->isInhere($url)) {
                continue;
            }

            try {
                $correctedUrl = $this->site->correctUrl($url);
                $knownPage = $this->repository->get($correctedUrl) ?? new Page($correctedUrl);

                if ($knownPage->isNotProcessed()) {
                    $accepted[] = $correctedUrl;
                }
            } catch (InvalidArgumentException $e) {
                // A URL that cannot be corrected is dropped from the list.
                continue;
            }
        }

        return $accepted;
    }

    /**
     * Loads and processes a batch of URLs, then recurses into the unprocessed children of each.
     *
     * NOTE(review): the depth counter is pre-incremented before it is compared with $maxDepth,
     * so $maxDepth values of 1 and 2 both stop after the first batch — confirm this is intended.
     *
     * @param array $urlList URLs to process at this level.
     * @param int $depth Current recursion depth, starting at 1.
     */
    private function pageProcessingRecursive(array $urlList, int $depth = 1): void
    {
        $start = microtime(true);

        $urlList = $this->correctUrls($urlList) ?? [];
        if ($urlList === []) {
            // Nothing left to load at this level.
            return;
        }

        $contentArray = $this->contentLoader->loadContent($urlList);
        $loadTime = microtime(true) - $start;

        foreach ($contentArray as $url => $content) {
            $start = microtime(true);
            $page = $this->repository->get($url) ?? new Page($url);

            if ($page->isNotProcessed()) {
                $this->repository->store($page);
                $this->pageProcessing($page, $content);
                // The load time of the whole batch is added to the page's own parsing time.
                $page->setProcessingTime(microtime(true) - $start + $loadTime);
            }

            unset($contentArray[$url]);
        }

        /** Check max depth level */
        if ($this->maxDepth <= ++$depth) {
            return;
        }

        foreach ($urlList as $url) {
            $parent = $this->repository->get($url);
            if ($parent === null) {
                // No page was stored for this URL (no content came back for it), so there is nothing to descend into.
                continue;
            }

            $pendingUrls = [];
            // Guard against a page whose children were never populated — presumably getChildren()
            // can be null for such a page; verify against Page.
            foreach ($parent->getChildren() ?? [] as $child) {
                if ($child->isNotProcessed()) {
                    $pendingUrls[] = $child->getUrl();
                }
            }

            $this->pageProcessingRecursive($pendingUrls, $depth);
        }
    }
}