<?php

namespace App\Http\Controllers;

use Illuminate\Http\Request;
use App\Http\Requests;
use RedBeanPHP\R;
use PhpQuery\PhpQuery as phpQuery;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use Cache;

/**
 * Searches Google Shopping for a given keyword and scrapes the
 * result page urls for microdata.
 */
class GoogleShoppingController extends Controller
{
    /**
     * HTML of the most recently fetched page, as read back from the cache.
     * Stays null until getHtml() has run, or when nothing could be cached.
     *
     * @var string|null
     */
    private $_html;

    /**
     * Links collected by the last call to getGoogleShoppingSearchResults().
     * Null until that method has been called.
     *
     * @var array|null
     */
    private $_urls;

    /**
     * Fetches $url through a Guzzle client (unless the page is already
     * cached), stores the body in the cache under the url, loads the cached
     * value into $_html and hands it to loadHtmlDom().
     *
     * A failed request is reported with echo and is not rethrown; in that
     * case nothing is cached and $_html ends up with whatever Cache::get()
     * returns for a missing key.
     *
     * @param string $url url to scrape for links
     * @return void
     */
    private function getHtml($url)
    {
        // Only hit the network when the url/html is not in the cache yet.
        if (!Cache::has($url)) {
            $request = new Client();

            try {
                $response = $request->get($url);

                // Cache the body as a plain string rather than the body
                // object itself, so the cached value does not depend on the
                // object being serializable.
                Cache::put($url, (string) $response->getBody(), 3600);
            } catch (RequestException $e) {
                // The message is always a string; the request/response
                // objects are not guaranteed to be string-castable across
                // Guzzle versions, so they are no longer echoed directly.
                echo $e->getMessage();
            }
        }

        $this->_html = Cache::get($url);
        $this->loadHtmlDom();
    }

    /**
     * Loads the html held in $_html into phpQuery.
     *
     * NOTE(review): getLinks() discards this return value and queries via
     * phpQuery::pq(), so it presumably relies on phpQuery remembering the
     * most recently created document - confirm against the PhpQuery port.
     *
     * @return mixed whatever phpQuery::newDocumentHTML() returns
     */
    private function loadHtmlDom()
    {
        return phpQuery::newDocumentHTML($this->_html);
    }

    /**
     * Scrapes the actual links <a href="" from given URL string
     *
     * @param string $url      URL to scrape for links
     * @param string $selector CSS selector to scrape
     * @return array href attribute of every element matched by $selector;
     *               an empty array when nothing matches
     */
    private function getLinks($url, $selector)
    {
        $this->getHtml($url);

        // Start from an empty list so "no matches" yields [] instead of an
        // undefined variable.
        $links = [];

        foreach (phpQuery::pq($selector) as $link) {
            $links[] = phpQuery::pq($link)->attr("href");
        }

        return $links;
    }

    /**
     * Search Google shopping for $keyword
     *
     * The scraped links are also kept in $_urls for the parse*() methods.
     *
     * @param string $keyword Keyword to search on GoogleShopping (raw, not url-encoded)
     * @return array scraped urls containing all results
     */
    public function getGoogleShoppingSearchResults($keyword)
    {
        // Encode the keyword so spaces, '&', '#', ... cannot break or alter
        // the query string.
        $query = urlencode($keyword);

        $this->_urls = $this->getLinks(
            "https://www.google.com/search?q={$query}&hl=en&tbm=shop",
            ".r a"
        );

        return $this->_urls;
    }

    /**
     * Parses only Google Shopping Product links from the _urls array
     *
     * @return array every matching url, rewritten to the form
     *               http://www.google.com/shopping/product/... (query
     *               string dropped); empty when there is no match
     */
    public function parseGoogleShoppingProductLinks()
    {
        $urls = [];

        // (array) turns a not-yet-populated (null) $_urls into an empty list.
        foreach ((array) $this->_urls as $url) {
            if (!is_string($url)) {
                // e.g. an anchor without an href attribute
                continue;
            }

            $parsed = parse_url($url);

            // strpos() returns 0 for a match at the very start of the path,
            // so compare against false explicitly.
            if (isset($parsed['path']) && strpos($parsed['path'], 'shopping/product/') !== false) {
                $urls[] = "http://" . 'www.google.com' . $parsed['path'];
            }
        }

        return $urls;
    }

    /**
     * Parses only Google Sponsored Product links from the _urls array
     *
     * A link counts as sponsored when its path is /aclk. Its "adurl" query
     * parameter is run through parse_str() and the "ds_dest_url" entry is
     * preferred, falling back to a non-empty "u" entry.
     *
     * @return array scraped urls for sponsored google products original
     *               store page; empty when there is no match
     */
    public function parseGoogleSponsoredProductLinks()
    {
        $urls = [];

        // (array) turns a not-yet-populated (null) $_urls into an empty list.
        foreach ((array) $this->_urls as $url) {
            if (!is_string($url)) {
                // e.g. an anchor without an href attribute
                continue;
            }

            $parsed = parse_url($url);

            if (!isset($parsed['path'], $parsed['query']) || $parsed['path'] !== '/aclk') {
                continue;
            }

            // parse_str() fills its second argument and returns nothing.
            parse_str($parsed['query'], $query);

            if (!isset($query['adurl']) || !is_string($query['adurl'])) {
                continue;
            }

            // The whole adurl value is parsed as if it were a query string,
            // exactly as before.
            parse_str($query['adurl'], $adurl);

            if (isset($adurl['ds_dest_url'])) {
                $urls[] = $adurl['ds_dest_url'];
            } elseif (!empty($adurl['u'])) {
                $urls[] = $adurl['u'];
            }
        }

        return $urls;
    }