Skip to content

Commit

Permalink
add feed discovery API
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronpk committed Nov 11, 2017
1 parent 796defb commit 206e27e
Show file tree
Hide file tree
Showing 12 changed files with 545 additions and 4 deletions.
53 changes: 49 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,13 @@ You can also use XRay to fetch all the rel values on a page, merging the list of

```php
$xray = new p3k\XRay();
$xray->http = $this->http;
$rels = $xray->rels('https://aaronparecki.com/');
```

This will return a similar response to the parser, but instead of a `data` key containing the parsed page, there will be `rels`, an associative array. Each key will contain an array of all the values that match that rel value.

```
$rels = Array
Array
(
[url] => https://aaronparecki.com/
[code] => 200
Expand All @@ -125,6 +124,41 @@ $rels = Array
```


### Feed Discovery

You can use XRay to discover the types of feeds available at a URL.

```php
$xray = new p3k\XRay();
$feeds = $xray->feeds('http://percolator.today');
```

This will fetch the URL, check whether the page itself is a Microformats feed, and look for `rel=alternate` links pointing to Atom, RSS or JSONFeed URLs. The response will look like the example below.

```
Array
(
[url] => https://percolator.today/
[code] => 200
[feeds] => Array
(
[0] => Array
(
[url] => https://percolator.today/
[type] => microformats
)
[1] => Array
(
[url] => https://percolator.today/podcast.xml
[type] => rss
)
)
)
```

### Customizing the User Agent

To set a unique user agent (some websites require one to be set), you can set the `http` property of the object to a `p3k\HTTP` object.
Expand Down Expand Up @@ -336,7 +370,8 @@ If the page being parsed represents a feed, then the response will look like the
"data": {
"type": "feed",
"items": [

{...},
{...}
]
}
}
Expand All @@ -346,14 +381,24 @@ Each object in the `items` array will contain a parsed version of the item, in t

Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned.

## Rels
## Rels API

There is also an API method to parse and return all rel values on the page, including HTTP `Link` headers and HTML rel values.

```
GET /rels?url=https://aaronparecki.com/
```

See [above](#rels) for the response format.

## Feed Discovery API

```
GET /feeds?url=https://aaronparecki.com/
```

See [above](#feed-discovery) for the response format.


## Token API

Expand Down
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"controllers/Parse.php",
"controllers/Token.php",
"controllers/Rels.php",
"controllers/Feeds.php",
"controllers/Certbot.php"
]
}
Expand Down
58 changes: 58 additions & 0 deletions controllers/Feeds.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;

/**
 * HTTP controller for the feed discovery endpoint.
 *
 * Reads the request parameters, delegates the actual discovery to
 * p3k\XRay::feeds() and writes the result back as a JSON response.
 */
class Feeds {

  // HTTP client handed to XRay; public so it can be swapped out by the caller
  public $http;
  // Whether the JSON response should be pretty-printed (set from the "pretty" parameter)
  private $_pretty = false;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fill in the response object with a JSON-encoded body.
   *
   * @param Response $response The response object to populate
   * @param int      $code     HTTP status code to send
   * @param array    $params   Data to JSON-encode as the response body
   * @param array    $headers  Extra headers to set (name => value)
   * @return Response The same response object that was passed in
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $name=>$value) {
      $response->headers->set($name, $value);
    }
    // Set after the custom headers so the content type is always JSON
    $response->headers->set('Content-Type', 'application/json');

    // These are bit flags, so combine them with a bitwise OR rather than addition
    $flags = JSON_UNESCAPED_SLASHES;
    if($this->_pretty) $flags |= JSON_PRETTY_PRINT;

    $response->setContent(json_encode($params, $flags)."\n");
    return $response;
  }

  /**
   * Handle a feed discovery request.
   *
   * Recognized request parameters:
   *  - url:           the URL to inspect (required)
   *  - timeout:       overall timeout; ignored unless it is a positive number
   *  - max_redirects: maximum number of redirects to follow
   *  - pretty:        when truthy, pretty-print the JSON response
   *
   * @param Request  $request
   * @param Response $response
   * @return Response 400 when the URL is missing or the result contains an
   *                  error, otherwise 200
   */
  public function find(Request $request, Response $response) {
    $opts = [];

    // The timeout comes straight from the query string. Only accept positive
    // numeric values: dividing a non-numeric string or an array would raise an
    // error instead of producing a usable timeout.
    $timeout = $request->get('timeout');
    if(is_numeric($timeout) && $timeout > 0) {
      // We might make 2 HTTP requests, so each request gets half the desired timeout
      $opts['timeout'] = $timeout / 2;
    }

    if($request->get('max_redirects')) {
      $opts['max_redirects'] = (int)$request->get('max_redirects');
    }

    if($request->get('pretty')) {
      $this->_pretty = true;
    }

    $url = $request->get('url');

    // Reject a missing URL, and also a non-string one (e.g. "url[]=..."),
    // which the URL parsing further down cannot handle
    if(!$url || !is_string($url)) {
      return $this->respond($response, 400, [
        'error' => 'missing_url',
        'error_description' => 'Provide a URL to fetch'
      ]);
    }

    $xray = new p3k\XRay();
    $xray->http = $this->http;
    $res = $xray->feeds($url, $opts);

    return $this->respond($response, !empty($res['error']) ? 400 : 200, $res);
  }

}
5 changes: 5 additions & 0 deletions lib/XRay.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ public function rels($url, $opts=[]) {
return $rels->parse($url, $opts);
}

/**
 * Discover the feeds available at a URL.
 *
 * Delegates to XRay\Feeds::find() using this object's HTTP client.
 *
 * @param string $url  The URL to inspect
 * @param array  $opts Options forwarded unchanged to XRay\Feeds::find()
 * @return array Whatever XRay\Feeds::find() returns
 */
public function feeds($url, $opts=[]) {
  return (new XRay\Feeds($this->http))->find($url, $opts);
}

public function parse($url, $opts_or_body=false, $opts_for_body=[]) {
if(!$opts_or_body || is_array($opts_or_body)) {
$fetch = new XRay\Fetcher($this->http);
Expand Down
113 changes: 113 additions & 0 deletions lib/XRay/Feeds.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
<?php
namespace p3k\XRay;

use p3k\XRay\Formats;

/**
 * Discovers which feeds (Microformats, JSONFeed, Atom, RSS) are available at a URL.
 */
class Feeds {
  // HTTP client; this class calls set_timeout(), set_max_redirects() and get() on it
  private $http;

  /**
   * @param object $http HTTP client used to fetch the URL (presumably a
   *                     p3k\HTTP instance; confirm against the callers)
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Fetch a URL and report the feeds found there.
   *
   * If the response itself is served as Atom, RSS or JSONFeed, that single
   * feed is reported. Otherwise the document is parsed as HTML: every
   * rel=alternate link with a recognized type is reported, and the page itself
   * is reported as a "microformats" feed when the HTML parser returns data of
   * type "feed".
   *
   * @param string $url  The URL to inspect; must be http or https
   * @param array  $opts Optional "timeout" and "max_redirects" settings for the
   *                     HTTP client; the whole array is also passed on to the
   *                     HTML parser
   * @return array On invalid input: ['error' => ..., 'error_description' => ...].
   *               Otherwise: ['url' => final URL, 'code' => HTTP status,
   *               'feeds' => list of ['url' => ..., 'type' => ...]] sorted by
   *               priority (microformats, jsonfeed, atom, rss).
   */
  public function find($url, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ];
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ];
    }

    $url = normalize_url($url);

    $result = $this->http->get($url);
    $body = $result['body'];

    $feeds = [];

    // First check the content type of the response
    $contentType = isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : '';

    // If the header is given as a list of values, use the last one
    if(is_array($contentType))
      $contentType = $contentType[count($contentType)-1];

    if(strpos($contentType, 'application/atom+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'atom'
      ];
    } elseif(strpos($contentType, 'application/rss+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'rss'
      ];
    } elseif(strpos($contentType, 'application/json') !== false
      && substr($body, 0, 1) == '{'
      && strpos(substr($body, 0, 100), 'https://jsonfeed.org/version/1') !== false) {
      // Generic JSON only counts as a JSONFeed when the version URL appears
      // near the start of the document
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'jsonfeed'
      ];
    } else {
      // Some other document was returned, parse the HTML and look for rel alternates and Microformats

      // Media types recognized on rel=alternate links, in the order they are checked
      $alternateTypes = [
        'application/json' => 'jsonfeed',
        'application/atom+xml' => 'atom',
        'application/rss+xml' => 'rss'
      ];

      $mf2 = \mf2\Parse($body, $result['url']);
      if(isset($mf2['alternates'])) {
        foreach($mf2['alternates'] as $alt) {
          // A rel=alternate link is not required to carry a type attribute
          // (e.g. hreflang alternates), so skip entries that cannot be classified
          if(!isset($alt['type']) || !isset($alt['url']))
            continue;

          foreach($alternateTypes as $mediaType => $feedType) {
            if(strpos($alt['type'], $mediaType) !== false) {
              $feeds[] = [
                'url' => $alt['url'],
                'type' => $feedType
              ];
            }
          }
        }
      }

      // The page itself is a feed if the HTML parser returns data of type "feed"
      $parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed']));
      if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') {
        $feeds[] = [
          'url' => $result['url'],
          'type' => 'microformats'
        ];
      }
    }

    // Sort feeds by priority. The comparator must return an integer that is
    // negative, zero or positive; returning a boolean is not a valid ordering.
    $rank = ['microformats'=>0,'jsonfeed'=>1,'atom'=>2,'rss'=>3];
    usort($feeds, function($a, $b) use($rank) {
      return $rank[$a['type']] - $rank[$b['type']];
    });

    return [
      'url' => $result['url'],
      'code' => $result['code'],
      'feeds' => $feeds
    ];
  }

}
3 changes: 3 additions & 0 deletions public/index.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ class Config {
$router->addRoute('POST', '/parse', 'Parse::parse');
$router->addRoute('POST', '/token', 'Token::token');

// Feed discovery endpoint: both GET and POST requests to /feeds are dispatched to Feeds::find
$router->addRoute('GET', '/feeds', 'Feeds::find');
$router->addRoute('POST', '/feeds', 'Feeds::find');

$router->addRoute('GET', '/rels', 'Rels::fetch');

$router->addRoute('GET', '/cert', 'Certbot::index');
Expand Down
Loading

0 comments on commit 206e27e

Please sign in to comment.