# July 02, 2006
#
# My HTTP utilities

require 'html/htmltokenizer'    # for HTMLTokenizer.getTag('a','area') 
require 'net/http'              # for Net::HTTP::Get
require 'cgi'                   # for CGI.escape()

class String
    # Returns true if this string ends with +substr+.
    #
    # Fixes the original reverse-slice implementation, which returned
    # false for an empty suffix (`[0..-1]` grabbed the whole string) and
    # compared against the full string when +substr+ was longer than self.
    #
    # NOTE(review): Ruby >= 1.8.7 ships the built-in String#end_with?,
    # which makes this monkey-patch unnecessary; kept only for callers
    # that use this spelled-out name.
    def ends_with?(substr)
        len = substr.length
        len <= length && self[-len, len] == substr
    end
end

#
# get_all_links - Get all links on a web page.
#
#   response = Net::HTTP.get(uri)
#   array_of_link_urls = get_all_links response
#
#
#
# get_all_links - Get all links on a web page.
#
#   response = Net::HTTP.get(uri)
#   array_of_link_urls = get_all_links response
#
# Returns an Array of href strings taken from every <a> and <area> tag,
# HTML-unescaped and with any query string ("?..." suffix) removed.
#
def get_all_links( html_string )
    tokenizer = HTMLTokenizer.new(html_string)
    links = []
    while t = tokenizer.getTag('a', 'area')
        link = t.attr_hash['href']
        next if link.nil?       # e.g. <a name="download" id="download"></a>
        link = CGI.unescapeHTML link
        # BUG FIX: the original computed the query-stripped match into `m`
        # but then pushed the unstripped link. Strip everything after "?"
        # for real, e.g. "http://www.s.com/m01.mpg?1234" -> ".../m01.mpg".
        links << link.sub(/\?.*/m, '')
    end
    links
end

#
# A shortcut for getting a page when cookie is required.
#
#   page_body = get_page_with_cookie "http://server.com", "user=john"
#
#
# A shortcut for getting a page when a cookie is required.
#
#   page_body = get_page_with_cookie "http://server.com", "user=john"
#
# url    - full URL string; both http and https are supported.
# cookie - raw Cookie header value, e.g. "user=john".
#
# Returns the response body string.
#
def get_page_with_cookie( url, cookie )
    uri = URI.parse url
    # Request only the path (+ query); the original passed the absolute
    # URL as the request target, which is not the conventional form.
    req = Net::HTTP::Get.new(uri.request_uri)
    req["Cookie"] = cookie
    http = Net::HTTP.new(uri.host, uri.port)
    # BUG FIX: https URLs silently spoke plain HTTP to port 443 before.
    http.use_ssl = (uri.scheme == 'https')
    res = http.start { |session| session.request(req) }
    res.body
end
# Posted by laza at July 2, 2006 01:24 AM | TrackBack
# Comments