IT戦記

プログラミング、起業などについて書いているプログラマーのブログです😚

Twitter の半径数クリック以内の情報収集

ちょっと

現実逃避的に自己満足的プログラムを書きたくなったので Twitterクローラーを書いてみた。
C++ にしては、割とすっきり書けて満足。

使ったライブラリ

ソース

#include <cassert>
#include <soci.h>
#include <soci-sqlite3.h>
#include <unistd.h>
#include <iostream>
#include <sstream>
#include <picojson.h>
#include <boost/scoped_ptr.hpp>
#include <boost/asio.hpp>
#include <boost/cast.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>

using namespace std;
using namespace SOCI;
using namespace boost;
using namespace boost::posix_time;
using namespace boost::gregorian;
namespace j = picojson;
typedef asio::ip::tcp::iostream tcpstream;

// Announces a pause of `t` seconds on stdout, then blocks the caller for
// that long with POSIX sleep().
void interval(unsigned int t) {
    std::cout << "Interval " << t << "[s]" << std::endl;
    ::sleep(t);
}

// Issues an HTTP/1.0 GET for `http://twitter.com<api_path>/<id>.json` and
// returns the response body parsed as JSON.
//
// Every call first pauses for 30 seconds through interval() before touching
// the network.
//
// Parameters:
//   api_path - path prefix of the API endpoint, e.g. "/users/show".
//   id       - anything streamable with operator<<; appended to the path.
//
// Returns the parsed value, or a null j::value when the stream is in a failed
// state after the request is sent, when the stream ends before the blank line
// that terminates the header, or when the body is not valid JSON.
template<class T>
j::value request(const std::string &api_path, const T &id) {

    interval(30);

    // Send the request
    std::cout << "Request http://twitter.com" << api_path << "/" << id << ".json" << std::endl;
    tcpstream s("twitter.com", "http");
    s << "GET " << api_path << "/" << id << ".json HTTP/1.0\r\nHost: twitter.com\r\n\r\n" << std::flush;
    if (!s) return j::value();

    // Skip the header: lines end in CRLF, so once getline() strips the '\n'
    // the empty separator line reads as a lone "\r".
    std::string line;
    while (std::getline(s, line)) if (line == "\r") break;
    if (!s) return j::value();

    // Parse the body; picojson reports failure through the returned string.
    j::value v;
    const std::string err = j::parse(v, s);
    if (!err.empty()) {
        std::cerr << "Oops JSON parse error!! (" << err << ")" << std::endl;
        return j::value();
    }

    return v;
}


// Reports whether the `user` table already holds a row for `id`.
// Only the explicit specialisations are defined.
template<class T> bool exists(Session& sql, T id);

// Lookup by screen name: counts the rows whose screen_name column equals
// `screen_name` and returns true when the count is non-zero.
// The log text is reproduced exactly as the program has always printed it.
template<>
bool exists<string>(Session& sql, string screen_name) {
    std::cout << "Ckeck conflict " << screen_name << std::endl;

    int matches = 0;
    sql << "SELECT count(id) FROM user WHERE screen_name = :screen_name", use(screen_name), into(matches);
    return matches != 0;
}

// Lookup by numeric user id: counts the rows whose id column equals `id`
// and returns true when the count is non-zero.
// The log text is reproduced exactly as the program has always printed it.
template<>
bool exists<unsigned long>(Session& sql, unsigned long id) {
    std::cout << "Ckeck conflict " << id << std::endl;

    int matches = 0;
    sql << "SELECT count(id) FROM user WHERE id = :id", use(id), into(matches);
    return matches != 0;
}

// Validates an API response against the expected JSON type `T`.
//
// Returns false, after logging and pausing via interval(), when
//   - `v` is an object carrying a non-null "error" member (pause: 1000 s), or
//   - `v` does not hold a `T` (pause: 100 s).
// Returns true otherwise.
template<class T>
bool check(const j::value& v) {

    // API-level failure: an object with an "error" member that is not null
    const bool api_error = v.is<j::object>() && !v.get("error").is<j::null>();
    if (api_error) {
        std::cout << "Oops API error!! (" << v << ")" << std::endl;
        interval(1000);
        return false;
    }

    // Anything that holds the expected type is accepted
    if (v.is<T>()) {
        return true;
    }

    std::cout << "Oops Type mismatch!! (" << v << ")" << std::endl;
    interval(100);
    return false;
}

// Lenient accessor: yields a copy of the `T` stored in `v`, or a
// default-constructed `T` when `v` holds some other type.
template<class T>
T get(const j::value& v) {
    return v.is<T>() ? v.get<T>() : T();
}

// Recursion limit for fetch(): friends of a user are only crawled while the
// current depth is below this value (the initial call runs at depth 0).
const unsigned int end_depth = 2;

template<class T>
void fetch(Session& sql, T id, unsigned int depth = 0) {

    while(true) {

        if (exists(sql, id)) {
            cout << "Ignore (" << id << ")" << endl;
        }
        else {
            // First fetch self data
            const j::value& value = request("/users/show", id);

            if (!check<j::object>(value)) continue;

            j::object v = value.get<j::object>();

            unsigned long uid = numeric_cast<unsigned long>(get<double>(v["id"]));
            unsigned long friends_count = numeric_cast<unsigned long>(get<double>(v["friends_count"]));
            unsigned long followers_count = numeric_cast<unsigned long>(get<double>(v["followers_count"]));
            unsigned long favourites_count = numeric_cast<unsigned long>(get<double>(v["favourites_count"]));
            unsigned long statuses_count = numeric_cast<unsigned long>(get<double>(v["statuses_count"]));
            unsigned long created_at = (lexical_cast<ptime>(get<string>(v["created_at"])) - ptime(date(1970, Jan, 1))).total_seconds();
            string time_zone = get<string>(v["time_zone"]);
            string screen_name = get<string>(v["screen_name"]);
            string profile_image_url = get<string>(v["profile_image_url"]);

            cout << "Insert " << screen_name << "(" << uid << ")" << endl;

            // Insert data
            sql << "INSERT INTO user VALUES("
                        ":id, :screen_name, :friends_count, :followers_count, :favourites_count, "
                        ":statuses_count, :created_at, :time_zone, :profile_image_url)",
                        use(uid), use(screen_name), use(friends_count), use(followers_count), use(favourites_count), 
                        use(statuses_count), use(created_at), use(time_zone), use(profile_image_url);
        }

        // Next fetch children data
        if (depth < end_depth) {
            cout << "Next depth" << endl;

            while (true) {
                const j::value &value = request("/friends/ids", id);

                if (!check<j::array>(value)) continue;

                j::array ids = value.get<j::array>();

                for (j::array::const_iterator it = ids.begin(); it != ids.end(); ++it) {
                    unsigned long uid = numeric_cast<unsigned long>(it->get<double>());
                    fetch(sql, uid, depth + 1);
                }
                break;
            }
        }
        break;
    }
}

int main() {
    // Setting ptime locale for lexical_cast
    locale::global(locale(locale(), new time_input_facet("%a %b %d %H:%M:%S +0000 %Y")));

    try {
        Session sql(sqlite3, "twitter.db");

        // Create table
        sql << "CREATE TABLE IF NOT EXISTS user ("
                    "id INTEGER PRIMARY KEY, screen_name TEXT UNIQUE, friends_count INTEGER, followers_count INTEGER, favourites_count INTEGER, "
                    "statuses_count INTEGER, created_at INTEGER, time_zone TEXT, profile_image_url TEXT)";

        fetch(sql, std::string("amachang"));
    }
    catch(const std::exception &e) {
        cerr << "Error: " << e.what() << "\n";
    }
}