#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
#
# Author: IKARASHI Yoshinori
# Created: 2005-09-17 21:45:01 JST
# Modified: 2008-04-07 23:18:56
# Licence: GPLv2
# $Id: mixi2gmail.rb 38 2008-04-07 23:22:01Z yoosee $
#

=begin
= mixi2gmail description
mixi2gmail feeds mixi diary entries to an e-mail account such as Gmail.

= configuration
This program requires a configuration file at $HOME/.mixi/mixirc .
It should contain the following parameters:

  username    = foo@example.com      # mixi account e-mail address
  password    = xxxxxxxx             # mixi login password
  mailaddress = foo@gmail.com        # e-mail address to send mail to
  mailfrom    = mixi@init.org        # dummy e-mail From address
  smtpserver  = localhost            # mail server
  excludelist = 1111, 34567          # owner ids to skip
  watchlist   = 98765, 43210, 2222   # owner ids to watch who are not your friends

The trailing "# ..." notes above are explanations only: the parser does not
strip trailing comments, so do not put them in the real file. Blank lines and
lines starting with "#" are ignored. Any value containing a comma is read as a
list.

= website
http://trac.yoosee.net/mixi2gmail/
=end

begin
  require 'rubygems'
rescue LoadError
end
require 'nkf'
require 'mechanize'
require 'net/smtp'
require 'time'
require 'optparse'
require 'logger'

CID_SUFFIX = 'init.org' # just used for multipart content-id part

module Mixi
  # Builds the Content-ID used both for the attached image part and for the
  # "cid:" reference in the HTML body: underscores in the file name become
  # dots and "@CID_SUFFIX" is appended.
  def self.content_id(filename)
    filename.gsub(/_/, '.') + '@' + CID_SUFFIX
  end

  # Reads "key = value" settings from the rc file into a Hash.
  class Config
    attr_reader :config

    # rcfile:: path of the configuration file; a missing file yields an
    #          empty #config.
    def initialize(rcfile)
      @rcfile = rcfile
      @config = {}
      config_file_parser
    end

    # Parses +rcfile+ line by line into @config.
    # Returns nil when the file does not exist.
    def config_file_parser(rcfile = @rcfile)
      return nil unless File.exist?(rcfile)

      File.foreach(rcfile) do |line|
        line = line.strip
        next if line.empty? || line[0, 1] == '#'

        # Limit of 2 keeps any "=" inside the value (e.g. in a password).
        key, value = line.split(/\s*=\s*/, 2)
        value = nil if value && value.empty?
        # NOTE(review): any value containing a comma becomes an Array, so a
        # password with a comma is split as well; this original behaviour is
        # kept because the set of list-valued keys is not declared anywhere.
        value = value.split(/\s*,\s*/) if value && value.include?(',')
        @config[key] = value
      end
    end

    # Placeholder: creates an OptionParser but defines no options yet.
    def command_line_parser
      opt = OptionParser.new
    end
  end

  # One diary entry: metadata, HTML body, comments and the popup pages of
  # its images.
  class Diary
    attr_accessor :author, :datetime, :title, :imagepages, :images, :body, :url, :comments

    def initialize
      @comments = []
      @imagepages = []
      @imagesrcs = [] # so #html is safe even before #parse has run
    end

    # Builds a local Time from two strings.
    # date:: must contain "YYYY<sep>MM<sep>DD" (two-digit month and day).
    # time:: may contain "HH:MM"; when absent, hour and minute are passed as
    #        nil, as the original code did.
    # Raises ArgumentError when +date+ holds no recognisable date.
    def mkdatetime(date, time)
      ymd = /(\d{4})\D+(\d{2})\D+(\d{2})/.match(date)
      raise ArgumentError, "unparsable date: #{date.inspect}" unless ymd

      hm = /(\d{2}):(\d{2})/.match(time)
      hour, minute = hm ? hm.captures : [nil, nil]
      Time.local(ymd[1], ymd[2], ymd[3], hour, minute)
    end

    # Fills this Diary from a fetched diary page.
    # doc:: parsed page responding to #search (the original comment says
    #       Hpricot::Doc).
    # Returns true on success, false when no title could be extracted.
    def parse(doc)
      @author = nil
      @comments = []
      author = doc.search("//div[@class='diaryTitleFriend clearfix']/h2").
                 inner_html.gsub("さんの日記", '').chomp
      dbox = doc.search("//div[@class='viewDiaryBox']")

      ### journal title
      dhead = dbox.search("/div[@class='listDiaryTitle']/dl")
      # NOTE(review): the pattern here reached review as an empty regex (a
      # no-op), apparently with its markup stripped; restored as "remove
      # tags from the title" -- confirm against the upstream file.
      title = dhead.search("dt").inner_html.gsub(/<.+?>/, '').chomp
      datetime = dhead.search("dd").inner_html
      date = datetime.gsub(/\d\d:\d\d/, '')
      time = datetime.gsub(/\d+年\d+月\d+日/, '')
      dbody = dbox.search("//div[@class='txtconfirmArea']")

      ### images (must run before the body is read: album images are
      ### rewritten in place to cid: references)
      collect_images(dbody)

      ### journal body
      body = dbody.search("//div[@id='diary_body']").inner_html

      if title.empty?
        puts " + failed to parse diary of #{author}"
        return false
      end
      enc = ENV['LANG'].to_s =~ /euc-jp/i ? 'e' : 'w'
      puts NKF.nkf("-#{enc}", " + #{author}: #{title} (#{date} #{time})")

      @author = author
      @title = title
      @body = body
      @datetime = mkdatetime(date, time)

      #### comments (the last comment box uses a different class)
      %w[diaryCommentbox diaryCommentboxLast].each do |css_class|
        doc.search("//div[@class='#{css_class}']").each do |c|
          @comments.push(parse_comment(c))
        end
      end
      true
    end

    # Renders the entry and its comments as the HTML mail body.
    # NOTE(review): the literals in this method reached review with their
    # markup stripped (the computed content id was unused and only
    # "[IMAGE]" survived). The cid: image reference follows from how
    # send_mail labels attachments; the surrounding tags are a minimal
    # reconstruction -- confirm against the upstream file.
    def html
      output = ''
      output << "<html><body>\n"
      output << "<h1>#{@title}</h1>\n"
      output << "<p>by #{@author} at #{@datetime.strftime('%Y-%m-%d %H:%M')}</p>\n"
      @imagesrcs.each do |src|
        output << "<img src=\"cid:#{Mixi.content_id(src)}\" alt=\"[IMAGE]\" /> "
      end
      output << "\n<div>#{@body}</div>\n<hr />\n"
      @comments.each do |comment|
        output << "<p>#{comment.author} at #{comment.datetime.strftime('%Y-%m-%d %H:%M')}</p>\n"
        output << "<div>#{comment.body}</div>\n"
      end
      output << "</body></html>"
      output
    end

    private

    # Extracts the popup page path from an MM_openBrWindow('...') handler.
    def popup_target(onclick)
      onclick.gsub(/MM_openBrWindow\('/, '').gsub(/'.+/, '')
    end

    # Maps a thumbnail URL to the base name of the large image
    # ("...s.jpg" becomes "....jpg").
    def large_image_name(src)
      src.gsub(/.+\//, "").gsub(/s\.jpg/, '.jpg')
    end

    # Collects image popup pages into @imagepages. Standard uploads are also
    # recorded in @imagesrcs (rendered by #html); album images embedded in
    # the body get their <img src> rewritten to a cid: reference instead.
    def collect_images(dbody)
      @imagepages = []
      @imagesrcs = []

      # pick up images from diary (standard image upload)
      dbody.search("//div[@class='diaryPhoto']//td").each do |cell|
        anchor = cell.at("a")
        img = anchor && anchor.at("img")
        next unless img && anchor['onclick']

        @imagepages.push(popup_target(anchor['onclick']))
        @imagesrcs.push(large_image_name(img['src']))
      end

      # pick up images from body (from mixi album)
      dbody.search("//div[@id='diary_body']/a").each do |anchor|
        onclick = anchor['onclick']
        img = anchor.at("img")
        next unless onclick && img

        @imagepages.push(popup_target(onclick))
        # rewrite image uri to large image filename
        img['src'] = "cid:#{Mixi.content_id(large_image_name(img['src']))}"
      end
    end

    # Builds a DiaryComment from one comment box node.
    def parse_comment(node)
      name_link = node.at("span[@class='commentTitleName']/a")
      author = node.search("span[@class='commentTitleName']/a").inner_html
      url = name_link ? name_link['href'] : nil
      stamp = node.search("span[@class='commentTitleDate']").inner_html
      body = node.search("dd").inner_html
      DiaryComment.new(author, comment_time(stamp), url, body)
    end

    # Uses the comment's own timestamp. The original code stamped every
    # comment with the diary's date, which is kept only as the fallback for
    # timestamps that cannot be parsed.
    def comment_time(stamp)
      mkdatetime(stamp, stamp)
    rescue ArgumentError
      @datetime
    end
  end

  # A single comment attached to a Diary.
  class DiaryComment
    attr_accessor :author, :datetime, :url, :body

    def initialize(author, datetime, url, body)
      @author, @datetime, @url, @body = author, datetime, url, body
    end
  end

  # Logs in to mixi and fetches diary entries newer than the id stored in
  # the last-id file.
  class DiaryFetcher
    attr_accessor :username, :password, :lastidfile, :logfile, :debug, :excludelist, :wait
    attr_reader :lastid

    # Sets defaults, reads the last delivered diary id and logs in.
    def initialize(username, password)
      @username, @password = username, password
      @logfile = ENV['HOME'] + '/.mixi/' + 'mixi-access.log'
      @lastidfile = ENV['HOME'] + '/.mixi/' + 'lastid.log'
      @excludelist = []
      @wait = 3
      @debug = true # set before login so its progress message is printed
      @lastid = read_lastid
      @maxid = @lastid
      login
    end

    # Creates the Mechanize agent (honouring HTTP_PROXY as "host[:port]"),
    # submits the login form and follows the redirect found in the reply.
    def login
      @agent = mechanize_class.new { |a| a.log = Logger.new(@logfile) }
      @agent.user_agent_alias = 'Windows Mozilla'
      if ENV['HTTP_PROXY']
        host, port = ENV['HTTP_PROXY'].split(/:/, 2)
        @agent.set_proxy(host, (port || 8080).to_i)
      end
      puts "access to http://mixi.jp as #{@agent.user_agent}"
      page = @agent.get('http://mixi.jp/')
      form = page.forms[0]
      set_field(form, 'email', @username)
      set_field(form, 'password', @password)
      set_field(form, 'next_url', '/home.pl')
      page = @agent.submit(form, form.buttons.first)
      puts "login to mixi as #{@username}." if @debug
      # "+" added: the original pattern captured a single character only and
      # so could not match a real redirect target.
      if /url=([^"]+)"/ =~ page.body
        @agent.get('http://mixi.jp' + $1.to_s)
      end
      sleep @wait if @wait > 0
    end

    # Fetches the diaries of +ownerid+ that are newer than the stored last
    # id. Returns an Array of Diary; entries that fail to parse are left out.
    def fetch_user_diary(ownerid)
      listurl = "http://mixi.jp/list_diary.pl?id=#{ownerid}"
      page = @agent.get(listurl)
      diary = []
      page.links.each do |link|
        next unless /続きを読む/ =~ link.node.inner_html &&
                    /view_diary.pl\?id=(\d+)&owner_id=(\d+)/ =~ link.href

        diaryid = $1.to_i
        @maxid = diaryid if @maxid < diaryid
        next if diaryid <= @lastid

        sleep @wait if @wait > 0
        mixidiary = Diary.new
        mixidiary.url = link.href
        # Diary#parse returns false on failure; queuing such an entry only
        # made mail delivery fail later on its nil author.
        diary.push(mixidiary) if mixidiary.parse(@agent.get(link.href))
      end
      diary
    end

    # Scans the friends' new-diary list for unseen entries and fetches the
    # diaries of every owner found there, except those in the exclude list.
    # Returns a flat Array of Diary.
    def fetch_friend_diary
      listurl = 'http://mixi.jp/new_friend_diary.pl'
      page = @agent.get(listurl)
      # Array() guards against a single-value setting arriving as a String,
      # where include? would do substring matching.
      excluded = Array(@excludelist).map { |id| id.to_s }
      list = []
      page.body.scan(/view_diary.pl\?id=(\d+)&owner_id=(\d+)/) do |diaryid, ownerid|
        diaryid = diaryid.to_i
        ownerid = ownerid.to_i
        @maxid = diaryid if @maxid < diaryid
        next if @lastid.to_i >= diaryid

        if excluded.include?(ownerid.to_s)
          puts "skip #{diaryid} because of exclude list" if @debug
          next
        end
        list.push(ownerid)
      end

      diary = []
      list.uniq.reverse.each do |ownerid|
        sleep @wait if @wait > 0
        puts "fetching diary: id=#{ownerid}" if @debug
        diary.concat(fetch_user_diary(ownerid))
      end
      diary
    end

    # Downloads the image shown on an image popup page.
    # Returns [image_data, src]; image_data is nil when nothing was fetched.
    def fetch_image(imagepage_uri)
      image, src = nil, ''
      return image, src if imagepage_uri.nil? || imagepage_uri.empty?

      page = @agent.get("http://mixi.jp/" + imagepage_uri)
      # NOTE(review): this pattern reached review as /]*>/i with no capture
      # group although $1 was read; restored as "capture the src of an <img>
      # tag" -- confirm against the upstream file. As in the original loop,
      # the last match wins.
      page.body.scan(/<img[^>]*\bsrc="([^"]+)"[^>]*>/i).flatten.each do |found|
        src = found
        puts " -> fetch image: #{src}"
        image = @agent.get(src).body
      end
      return image, src
    end

    # Persists the highest diary id seen so the next run skips it.
    def close
      File.open(@lastidfile, 'w') do |file|
        file.write(@maxid)
      end
    end

    private

    # Returns the stored last diary id, or 0 when the file is missing or
    # unreadable.
    def read_lastid
      File.read(@lastidfile).to_i
    rescue SystemCallError
      0
    end

    # Old mechanize releases expose WWW::Mechanize, newer ones Mechanize.
    def mechanize_class
      defined?(WWW::Mechanize) ? WWW::Mechanize : ::Mechanize
    end

    # Assigns +value+ to the named form field, failing with a clear message
    # when the login form does not have it.
    def set_field(form, name, value)
      field = form.fields.find { |f| f.name == name }
      raise "login form field '#{name}' not found" unless field

      field.value = value
    end
  end
end

# Sends one diary as a multipart mail.
# text::       HTML body of the mail
# subject::    subject line (MIME-encoded via NKF)
# datetime::   Time used for the Date header
# mail_from::  envelope and header sender
# recipients:: one address or an Array of addresses
# smtpserver:: SMTP host, contacted on port 25
# images::     Hash of image URL => binary data; nil data is skipped
def send_mail(text, subject, datetime, mail_from, recipients, smtpserver, images)
  boundary = '__mixi2gmail_123456789ABCDEFGHIJ__'
  to_list = Array(recipients)

  body = "--#{boundary}\n"
  body << "Content-Type: text/html; charset=\"iso-2022-jp\"\n"
  body << "Content-Transfer-Encoding: 7bit\n\n"
  body << text
  body << "\n\n"
  images.each do |src, image|
    next if image.nil?

    filename = src.gsub(/.+\//, "")
    body << "--#{boundary}\n"
    body << "Content-Type: image/jpeg;\n"
    body << " name=\"#{filename}\"\n"
    body << "Content-ID: <#{Mixi.content_id(filename)}>\n"
    body << "Content-Transfer-Encoding: base64\n"
    body << "Content-Disposition: inline;\n"
    body << " filename=\"#{filename}\"\n\n"
    body << [image].pack('m')
    body << "\n\n"
  end
  # A multipart body must end with the closing "--boundary--" delimiter.
  body << "--#{boundary}--\n"

  # Exactly one top-level Content-Type: the text part's type and charset are
  # declared on the part itself, not alongside multipart/mixed.
  headers = [
    "From: #{mail_from}",
    "To: #{to_list.join(', ')}",
    "Subject: #{NKF::nkf('-M', subject)}",
    "Date: #{datetime.rfc2822}",
    "MIME-Version: 1.0",
    "X-ML-Name: mixi",
    "Content-Type: multipart/mixed; boundary=\"#{boundary}\""
  ].join("\n")
  message = "#{headers}\n\n#{NKF.nkf('-j', body)}\n"

  Net::SMTP.start(smtpserver, 25) do |smtp|
    smtp.send_message(message, mail_from, recipients)
  end
end

# Fetches a diary's images and mails the entry. Errors are reported and
# swallowed so that one bad entry does not stop the run.
def deliver_diary(agent, diary, settings)
  images = {}
  diary.imagepages.each do |imagepage|
    image, src = agent.fetch_image(imagepage)
    # An image that could not be fetched must not abort the whole mail.
    images[src] = image if image
    sleep 2
  end
  send_mail(diary.html, diary.author + 'さんの日記', diary.datetime,
            settings['mailfrom'], settings['mailaddress'],
            settings['smtpserver'], images)
rescue StandardError => e
  puts "Error: #{diary.url} => #{e}:#{e.backtrace}"
  p diary if agent.debug
end

# Script entry point: delivers new friend diaries, then the watch list.
def main
  rcfile = "#{ENV['HOME']}/.mixi/mixirc"
  # Config.new never returns nil, so the missing file is detected here.
  unless File.exist?(rcfile)
    puts "configuration file #{rcfile} not found."
    exit 1
  end
  settings = Mixi::Config.new(rcfile).config

  agent = Mixi::DiaryFetcher.new(settings['username'], settings['password'])
  agent.debug = true
  agent.excludelist = Array(settings['excludelist'])

  agent.fetch_friend_diary.reverse.each do |diary|
    deliver_diary(agent, diary, settings)
  end

  # Array() also covers a single-value watchlist, which is read as a String.
  Array(settings['watchlist']).each do |ownerid|
    agent.fetch_user_diary(ownerid).each do |diary|
      deliver_diary(agent, diary, settings)
    end
  end
  agent.close
end

main if __FILE__ == $0