So I was thinking, wouldn’t it be nice if the Australian Dictionary of Biography’s ‘born on this day’ feature could be made available as an RSS feed? Every morning you’d get a new list of biographies delivered direct to your feed reader. And so…
[sounds of xpath wrangling and PHP coding]
It’s pretty simple – it harvests all the links of people born on the current day, then loops through the links to gather the first paragraph of each biography. Then it’s just a matter of writing everything to an RSS file.
In case you missed it, I also created a Media RSS feed for portrait images used in the ADB. This enables them to be viewed in CoolIris.
Code follows…
<?php
/**
 * Fetch a URL through an existing cURL handle.
 *
 * Points the handle at $url, executes the transfer and returns whatever
 * curl_exec() produced. When that result is falsy, the cURL error number
 * and message are echoed and the script terminates.
 *
 * @param string $url Address to request.
 * @param mixed  $ch  Handle passed to curl_setopt()/curl_exec(); the caller
 *                    is expected to have configured it already.
 * @return mixed The (truthy) result of curl_exec().
 */
function getPage($url, $ch) {
    curl_setopt($ch, CURLOPT_URL, $url);
    $response = curl_exec($ch);

    // Happy path first: anything truthy is handed straight back.
    if ($response) {
        return $response;
    }

    // Falsy result: report what cURL knows about the failure and stop.
    echo "cURL error number:" . curl_errno($ch);
    echo "cURL error:" . curl_error($ch);
    exit;
}
// ADB "born on this day" index page that the feed is built from.
$url = "http://www.adb.online.anu.edu.au/scripts/adbp-births-deaths.php";

// NOTE(review): requests identify themselves as Googlebot. Left unchanged
// because the remote site's handling of other user agents is unknown.
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

// A single cURL handle is configured here and reused by each getPage() call.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_USERAGENT      => $userAgent,
    CURLOPT_FAILONERROR    => true,  // HTTP status >= 400 counts as a failed transfer
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_AUTOREFERER    => true,
    CURLOPT_RETURNTRANSFER => true,  // curl_exec() returns the body instead of printing it
    CURLOPT_TIMEOUT        => 10,    // seconds
]);

$html = getPage($url, $ch);

// Collect libxml parse complaints internally instead of silencing the call
// with "@", then discard them once the document has been parsed.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();

// Anchor elements inside the first <ul class="pb-results"> on the page; each
// one supplies both the link (href attribute) and the display text.
// The former separate text() query was never read, so it has been dropped.
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("//ul[@class='pb-results'][1]/li/a");
// Emit the RSS 2.0 document: channel header, one <item> per harvested link,
// then the closing tags.
echo "<?xml version='1.0'?>\n";
echo "<rss version='2.0'>\n";
echo "<channel>\n";
// RSS 2.0 requires <title>, <link> and <description> on the channel; the
// title element was missing from the output.
echo "<title>Australian Dictionary of Biography: born on this day</title>\n";
echo "<link>http://www.adb.online.anu.edu.au/scripts/adbp-births-deaths.php</link>\n";
echo "<description>A list of all those people in the Australian Dictionary of Biography who were born on this day.</description>\n";

// Parse warnings from the scraped HTML are collected internally rather than
// silenced with "@".
libxml_use_internal_errors(true);

foreach ($hrefs as $href) {
    $title = trim($href->nodeValue);

    // The first two characters of the href are dropped before it is appended
    // to the site root (presumably a leading ".." — verify against the live
    // markup).
    $bioUrl = "http://www.adb.online.anu.edu.au" . substr($href->getAttribute('href'), 2);

    $bioDom = new DOMDocument();
    $bioDom->loadHTML(getPage($bioUrl, $ch));
    libxml_clear_errors();
    $bioXpath = new DOMXPath($bioDom);

    // Concatenate the direct text nodes of the first paragraph in the
    // element with id="content".
    $bio = "";
    foreach ($bioXpath->evaluate("//div[@id='content']/p[1]/text()") as $textNode) {
        $bio .= $textNode->nodeValue;
    }

    // Replace real line breaks with a space. The previous '\n' in single
    // quotes matched a literal backslash followed by "n", so nothing was
    // ever removed. A space (not '') keeps words on wrapped source lines
    // from being glued together.
    $bio = trim(str_replace(["\r\n", "\r", "\n"], ' ', $bio)) . "...";

    // Everything interpolated into the XML is escaped so that characters
    // such as "&" or "<" cannot break the feed.
    $safeTitle = htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
    $safeUrl   = htmlspecialchars($bioUrl, ENT_QUOTES, 'UTF-8');
    $safeBio   = htmlspecialchars($bio, ENT_QUOTES, 'UTF-8');

    echo "<item>\n";
    echo "<title>$safeTitle</title>\n";
    echo "<link>$safeUrl</link>\n";
    echo "<description>$safeBio</description>\n";
    echo "</item>\n";
}

echo "</channel>\n";
// The root element was never closed, which left the document malformed.
echo "</rss>\n";
?>
Post a Comment