Snippets

ESTI design yandex_parser

Created by Illia Barkov
#!/usr/bin/env ruby
require_relative '../../main.rb'

require 'digest'
require 'httparty'
require 'date'
require 'pry'
require 'csv'
require 'nokogiri'
require 'activerecord-import'

# Script-wide logger writing to a fixed file under /tmp.
# For interactive runs it can be swapped for: Logger.new STDOUT
LOGGER = Logger.new('/tmp/update_epg-v2.log')
ActiveRecord::Base.logger = LOGGER
# Any value in DEBUG selects debug-level logging; otherwise info level is used.
ActiveRecord::Base.logger.level = ENV['DEBUG'] ? Logger::DEBUG : Logger::INFO

# channel_id = 462
# Optional channel id taken from the CH_ID environment variable (nil when unset).
ID_OVERRIDE = ENV['CH_ID']
# Half-open date range that starts today and ends before today + 5.
DATE_LIMIT = Date.today + 5
DATE_START = Date.today
PERIOD = (DATE_START...DATE_LIMIT)
# UTC offsets, overridable through the environment variables of the same name.
TIME_SHIFT_SOURCE = ENV.fetch('TIME_SHIFT_SOURCE', '+03:00')
TIME_SHIFT_OUTPUT = ENV.fetch('TIME_SHIFT_OUTPUT', '-04:00')
# Wall-clock start, read back at the end of the script to report total run time.
@start_time = Time.now

# Root URL of the listings site the schedule pages are fetched from.
YANDEX_TV_URL = 'https://tv.yandex.ru'.freeze
# Browser-like User-Agent header sent with every request.
YANDEX_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17'.freeze
# Leading URL path segments tried, in order, when the first page has no
# channel data (presumably Yandex region ids -- confirm).
YANDEX_FALLBACK_REGIONS = [2, 213].freeze
# Captures the JSON object assigned to window.__INITIAL_STATE__ in the page.
YANDEX_STATE_PATTERN = /window.__INITIAL_STATE__ = ({.*});$/

# Downloads +link+ and returns the parsed JSON state embedded in the page.
# Raises when the page does not contain the embedded state.
def fetch_yandex_state(link)
  LOGGER.info('[DEBUG]') { link } if ENV['DEBUG']
  # URI.escape was removed in Ruby 3.0; the parser method is the same routine.
  response = HTTParty.get(URI::DEFAULT_PARSER.escape(link), headers: { 'User-Agent' => YANDEX_USER_AGENT })
  embedded = YANDEX_STATE_PATTERN.match(response.body.to_s)
  raise "No embedded page state found at #{link}" if embedded.nil?

  JSON.parse(embedded[1])
end

# Returns the page state for +channel+ on +day+. The first URL uses the CSV row
# (+creds+) when one exists, otherwise the bare channel URL; the fallback
# regions are then tried until a page carries a "channel" entry.
def yandex_channel_state(channel, creds, day)
  stamp = day.strftime('%Y-%m-%d')
  uid = channel.epg_external_id
  first_path = creds.nil? ? "channel/#{uid}" : "#{creds[0]}/channel/#{creds[1]}"
  paths = [first_path] + YANDEX_FALLBACK_REGIONS.map { |region| "#{region}/channel/#{uid}" }

  state = nil
  paths.each do |path|
    state = fetch_yandex_state("#{YANDEX_TV_URL}/#{path}?date=#{stamp}")
    break unless state['channel'].nil?
  end
  state
end

# Converts one schedule event into the attribute hash stored for an Epg row.
# When the event has no description but has a URL, the description is read
# from the program page instead.
def build_epg_entry(channel, event)
  # A missing "episode" hash is treated like a missing description.
  description = (event['episode'] || {})['description']
  starts_at = Time.parse(event['start'])
  ends_at = Time.parse(event['finish'])
  details_path = event['url']

  if description.nil? && !details_path.nil?
    LOGGER.info('[Script]') { 'Description has not found. Crawling external page. ' }
    description = fetch_yandex_state("#{YANDEX_TV_URL}#{details_path}")['program']['meta']['description']
  end

  {
    ch_id: channel.id,
    time: starts_at,
    time_to: ends_at,
    duration: (ends_at - starts_at).to_i.to_s, # seconds, stored as a string
    name: event['title'],
    descr: description.to_s
  }
end

# Fetches the schedule of +channel+ for +day+ and appends its entries to +epg+.
# Entries are appended one by one so that events parsed before a failure are kept.
def collect_yandex_day(channel, creds, day, epg)
  listing = yandex_channel_state(channel, creds, day)['channel']
  raise "No channel data for #{channel.epg_external_id} on #{day}" if listing.nil?

  title = listing['channel']['title']
  if title != channel.name
    LOGGER.warn('Script') { "Channel id: #{channel.id} will be updated from Yandex channel #{title}" }
  end

  events = listing['schedule']['events']
  LOGGER.info('[Script]') { "Events found: #{events.count}" }
  events.each { |event| epg << build_epg_entry(channel, event) }
end

# Refreshes the stored EPG of channel +c+ from tv.yandex.ru for every day in PERIOD.
#
# +c+ must respond to id, name, name_original and epg_external_id.
# Channels whose id is listed in ARGV are skipped and 0 is returned.
# A failure while processing one day is logged and the remaining days still run.
# When no entries were collected the stored records are left untouched and nil
# is returned; otherwise records of this channel starting at the first collected
# entry are destroyed, the new ones are imported and the import result is returned.
def grab_epg(c)
  if ARGV.include?(c.id.to_s)
    LOGGER.info('[Script]') { "Channel ID: #{c.id} | Channel name: #{c.name_original} | Skipped... " }
    return 0
  end

  LOGGER.info('[Script]') { "Channel ID: #{c.id} | Channel uid: #{c.epg_external_id} | Channel name: #{c.name_original} " }

  # The CSV row for this channel does not depend on the day, so look it up once.
  table = CSV.read('./tv.yandex.ru.channels.csv')
  creds = table.find { |row| row[1] == c.epg_external_id }

  epg = []
  PERIOD.each do |day|
    begin
      LOGGER.info('[Script]') { "Day for update: #{day}" }
      collect_yandex_day(c, creds, day, epg)
    rescue => e
      LOGGER.error('[Script]') { e }
    end
  end

  # Drop duplicate events while the values are still Time objects.
  epg.uniq!
  epg.each do |entry|
    [:time, :time_to].each do |key|
      entry[key] = entry[key].getlocal(TIME_SHIFT_OUTPUT).strftime('%Y-%m-%d %H:%M:%S')
    end
  end

  # Nothing was collected: wiping would need a start time that does not exist.
  if epg.empty?
    LOGGER.warn('[Script]') { "Channel: #{c.name_original} | No EPG entries collected, existing records kept" }
    return
  end

  # NOTE(review): the wipe and the import are not wrapped in a transaction.
  LOGGER.info('[Script]') { 'Wiping records...' }
  Epg.where('ch_id = ? and time >= ?', c.id, epg.first[:time]).destroy_all
  LOGGER.info('[Script]') { 'Updating DB' }
  LOGGER.info('[Script]') { "Channel: #{c.name_original} | Epg found! Enteties: #{epg.count} " }
  Epg.import epg
end

# Runs the given block and logs any StandardError it raises, so the script
# carries on with the next step.
def log_failures
  yield
rescue => err
  LOGGER.error("[Script]") { err }
end

# Without an override every channel whose epg_api_type is "Yandex_html" is
# processed; with one, only the channel with that id.
if ID_OVERRIDE.nil?
  Itv.where(epg_api_type: "Yandex_html").each do |ch|
    log_failures { grab_epg(ch) }
  end
else
  log_failures { grab_epg(Itv.find(ID_OVERRIDE)) }
end

ActiveRecord::Base.connection.close
LOGGER.info("[Script]") { "--- Total time: #{(Time.now - @start_time).round(2)} sec"}

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.