#!/usr/local/bin/ruby
=begin

  get_html_table.rb

  Getting texts of tables from HTML.

  Copyright (C) 2001-2002 Masaharu FUJITA

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

  You may contact the author by:
     e-mail:  fujita@a2z.co.jp
     url:  http://www.a2z.co.jp/~fujita/Ruby/get_html_table.rb

=end

require "socket"
require "timeout"

# kconv and jcode are legacy libraries (jcode no longer ships with Ruby 1.9+).
# Nothing in the classes below needs them, so a missing library must not
# prevent the file from loading.
%w[kconv jcode].each do |legacy_lib|
  begin
    require legacy_lib
  rescue LoadError
    # Not available on this Ruby; continue without it.
  end
end

# TCP port used for every request.
Port = 80

# Seconds allowed for a single connection attempt.
# This used to be assigned to a top-level constant named `Timeout`, which
# overwrote the standard library's Timeout module loaded just above.
CONNECT_TIMEOUT = 20

# Not referenced by the code in this file.
# NOTE(review): presumably an output-encoding flag for Kconv ('e' = EUC?) — confirm.
$Kconv = 'e'

# Fetches a document from an http:// URL over a raw TCP socket.
class GetHtmlFile
  # Lines read by the last #get_html call (Array of String), or nil before it.
  attr_reader :html_string

  # url:: "http://host/path" (the scheme prefix is optional; the path may be
  #       omitted, in which case "/" is requested).
  def initialize(url)
    host, path = url.sub(%r{\Ahttp://}i, '').split('/', 2)
    @server = host
    # path is nil when the URL has no "/" after the host.
    @directory = '/' + path.to_s
    @html_string = nil
  end

  # Connects to the server, sends "GET <path>" and stores every line of the
  # response in #html_string. Returns the stored lines.
  # The socket is closed even if reading fails.
  def get_html
    sock = open_socket
    begin
      sock.print "GET #{@directory}\r\n"
      @html_string = sock.readlines
    ensure
      sock.close
    end
  end

  private

  # Opens the TCP connection, allowing CONNECT_TIMEOUT seconds per attempt.
  # A failed attempt is retried; once more than ten attempts have already
  # failed, a message is printed and the process exits.
  def open_socket
    failures = 0
    begin
      ::Timeout.timeout(CONNECT_TIMEOUT) { TCPSocket.open(@server, Port) }
    rescue StandardError
      if failures > 10
        print "Can't open socket."
        exit
      end
      failures += 1
      retry
    end
  end
end # class GetHtmlFile

# Extracts the text of table cells from HTML.
#
# NOTE(review): in the copy of this file that was reviewed, every "<...>" span
# inside the code had been stripped (the tag patterns and part of cut_tag were
# missing). The <BODY>, <TABLE>, <TR> and <TD> patterns and the cut_tag logic
# below are reconstructions — confirm against the upstream file.
class GetTableText
  # Result of the last #get_text call: table[t][r][c] is the text of cell c in
  # row r of table t. nil before #get_text has run.
  attr_reader :table

  # html:: the document, either as an Array of lines (what
  #        GetHtmlFile#html_string returns) or as a single String.
  def initialize(html)
    @html = html
    @html_body = nil
    @table = nil
  end

  # Collects the lines between the line containing the opening BODY tag and
  # the line containing the closing BODY tag (both of those lines are
  # excluded), strips each line and joins them with no separator.
  # Stores and returns the joined string.
  def get_html_body
    lines = @html.respond_to?(:each_line) ? @html.each_line : @html
    inside_body = false
    collected = []
    lines.each do |line|
      if inside_body
        break if line =~ /<\/BODY>/i
        collected << line.strip
      elsif line =~ /<BODY.*>/i
        inside_body = true
      end
    end
    @html_body = collected.join
  end

  # Returns the inner text of every element delimited by p1 and p2.
  #
  # string:: text to search
  # p1::     pattern matching the start of the opening tag WITHOUT its closing
  #          ">" (e.g. /<TR/i), so attributes are tolerated; it should not
  #          contain capture groups
  # p2::     pattern matching the whole closing tag (e.g. /<\/TR>/i)
  #
  # Example: cut_tag('x<TD a=1>A</TD>y<TD>B</TD>', /<TD/i, /<\/TD>/i)
  #          returns ["A", "B"].
  # Nested elements of the same kind are not handled.
  def cut_tag(string, p1, p2)
    # Whatever precedes the first opening tag is outside any element.
    after_open = string.split(p1).drop(1)
    inner = after_open.map do |chunk|
      upto_close = chunk.split(p2, 2).first.to_s
      # Drop the remainder of the opening tag (attributes and ">").
      upto_close.split('>', 2)[1]
    end
    # An opening tag that never reached its ">" yields nil; skip it.
    inner.compact
  end

  # Builds #table from the document body and returns it.
  # Runs #get_html_body first if it has not been called yet.
  def get_text
    get_html_body if @html_body.nil?
    @table = cut_tag(@html_body, /<TABLE/i, /<\/TABLE>/i).map do |table_html|
      cut_tag(table_html, /<TR/i, /<\/TR>/i).map do |row_html|
        cut_tag(row_html, /<TD/i, /<\/TD>/i).map do |cell_html|
          strip_tags(cell_html)
        end
      end
    end
  end

  # Prints every cell as "[table][row][cell]: text" followed by a separator line.
  def print_index_text
    @table.each_with_index do |rows, i|
      rows.each_with_index do |cells, j|
        cells.each_with_index do |text, k|
          print "[#{i}][#{j}][#{k}]: #{text}\n=====================\n"
        end
      end
    end
  end

  private

  # Removes everything from each "<" through the next ">". A "<" with no
  # closing ">" removes the rest of the string.
  def strip_tags(fragment)
    fragment.gsub(/<[^>]*(?:>|\z)/, '')
  end
end # class GetTableText