Overclock.net Mechanical Keyboard Guide Atom Feed
Sunday, August 29th, 2010I use RSS / atom feeds pretty much all the time to minimize information overload but the Mechanical Keyboard Guide of Overclock.net doesn't make my job any easier because they don't provide any feeds and the thread moves very fast.
I couldn't tolerate this anymore so I've created a webscraper that provides atom feeds for this thread. Parsing HTML into a DOM and executing XPath queries on the DOM is something that I have a vast amount experience with and this project didn't take a long time either. I've been testing it for more than a month and it's rock solid. The only glitch is that sometimes posts are randomized between very short time intervals which is a minor inconvenience.
The script below is executed on a hourly basis by cron and its content is saved to http://monda.hu/overclock-net-mech-keyboard.xml
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | < ?php include 'config.php'; // include $database_{servername, username, password, dbname} $start_url = 'http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide-10000.html'; function DOMinnerHTML($element) { // Borrowed from php.net $innerHTML = ""; $children = $element->childNodes; foreach ($children as $child) { $tmp_dom = new DOMDocument(); $tmp_dom->appendChild($tmp_dom->importNode($child, true)); $innerHTML .= trim($tmp_dom->saveHTML()) . ' '; } return $innerHTML; } function page_to_entries($html) { $domdocument = new DOMDocument(); @$domdocument->loadHTML($html); $domxpath = new DomXPath($domdocument); $xpath = '/html/body/div[2]/div/div/div/div/div/table'; $nodelist = $domxpath->query($xpath); $entries = array(); foreach ($nodelist as $node) { $link_node = $node->firstChild->childNodes->Item(2)->childNodes->Item(1); $id = $link_node->textContent; $url = $link_node->getAttribute('href'); $username = $node->childNodes->Item(1)->firstChild->childNodes->Item(1)->textContent; $comment = DOMInnerHTML($node->childNodes->Item(1)->childNodes->Item(2)->childNodes->Item(8)); $entry = array('id'=>$id, 'url'=>$url, 'username'=>$username, 'comment'=>$comment); $entries[] = $entry; } return $entries; } function query($sql) { if (($result=mysql_query($sql)) === false) { die(mysql_error()); } return $result; } function register_entry_and_get_timestamp($id) { if (!is_numeric($id)) { die("Entry ID is not numeric!"); } $result = query("INSERT IGNORE INTO mechanical_keyboard_guide SET id=$id"); $result = query("SELECT timestamp FROM mechanical_keyboard_guide WHERE id=$id"); $row = mysql_fetch_assoc($result); $timestamp = strtr($row['timestamp'], ' ', 'T') . 'Z'; return $timestamp; } function add_updated_timestamp(&$entries) { for ($i=0; $i<count ($entries); $i++) { $entries[$i]['updated'] = register_entry_and_get_timestamp($entries[$i]['id']); } } function print_entry($entry) { $url = $entry['url']; $username = htmlspecialchars($entry['username']); $comment = $entry['comment']; $updated = $entry['updated']; print "<entry>\n"; print " <title>$username</title>\n"; print " <link href=\"$url\"/>\n"; print " <id>$url</id>\n"; print " <updated>$updated</updated>\n"; print " <summary type=\"html\">< ![CDATA[$comment]]></summary>\n"; print "\n"; } // Set up MySQL connection. if (mysql_connect($database_servername, $database_username, $database_password) === false) { die('Failed to connect to the MySQL server. Please check the $database_servername, $database_username, $database_password variables in config.php'); } if (mysql_select_db($database_dbname) === false) { die('Failed to select the MySQL database. Please check the $database_dbname variable in config.php'); } // Set up cURL. $ch = curl_init(); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // Fetch last page. curl_setopt($ch, CURLOPT_URL, $start_url); $last_page_html = curl_exec($ch); // Get the page ID of the page before the last page. $last_page_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); preg_match('/-([0-9]+)\.html$/', $last_page_url, $matches); $last_page_id = $matches[1]; $almost_last_page_id = $last_page_id - 1; // Fetch the page before the last page. $almost_last_page_url = "http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide-$almost_last_page_id.html"; curl_setopt($ch, CURLOPT_URL, $almost_last_page_url); $almost_last_page_html = curl_exec($ch); $almost_last_page_entries = page_to_entries($almost_last_page_html); $last_page_entries = page_to_entries($last_page_html); $entries = array_merge($almost_last_page_entries, $last_page_entries); add_updated_timestamp($entries); $last_updated_timestamp = $entries[count($entries)-1]['updated']; print '< ?xml version="1.0" encoding="utf-8" ?>'; ?> <feed xmlns="http://www.w3.org/2005/Atom"> <title>Overclock.net's Mechanical Keyboards Guide</title> <subtitle>Overclock.net's Mechanical Keyboards Guide</subtitle> <link href="http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide.html"/> <updated>< ?php print $last_updated_timestamp ?></updated> <author> <name>Overclock.net's Mechanical Keyboards Guide</name> <email>laci_nospam@monda.hu</email> </author> <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> < ?php foreach ($entries as $entry) print_entry($entry) ?> </feed> </count> |
As for the SQL table structure, it's not particularly complex.
0 1 2 3 4 | CREATE TABLE IF NOT EXISTS `mechanical_keyboard_guide` ( `id` INT(11) NOT NULL, `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`id`) ) ENGINE=MyISAM DEFAULT CHARSET=utf8; |





