Tracking GitHub File Updates and YouTube Updates with RSS

2021/07/24

Preface

It has been a long time since I updated this blog. RSS, as a way of consuming information, seems to be making a comeback.

GitHub file updates

There are plenty of good resources on GitHub. My pipeline is: the GitHub README file gets updated -> download the corresponding files via svn -> generate an RSS file.

Why so roundabout? Because subscribing to the repository's own feed produces a lot of RSS entries I don't need, and downloading files directly from GitHub is slow.
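The svn step works because GitHub exposed an SVN interface on every repository at the time of writing, with trunk mapping to the default branch, so a single subdirectory can be checked out without cloning everything. A minimal sketch of that trick, using the same repository as the script below:

# GitHub's SVN bridge: "trunk/<path>" maps to <path> on the default branch,
# so svn checkout fetches just that one subdirectory.
import subprocess

subdir = "01_economist/te_2021.02.27"  # a relative path parsed out of the README
subprocess.run(
    ["svn", "checkout",
     "https://github.com/hehonghui/the-economist-ebooks/trunk/" + subdir,
     subdir],
    check=True)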

The end result: updates for The Economist and The New Yorker~

[Screenshot: the resulting RSS feed listing The Economist and The New Yorker issues]

The download script

It parses the links in the README, downloads them automatically, and generates the RSS feed.

# -*- coding: utf-8 -*-
# Run hourly from /etc/cron.hourly
from urllib import request
import datetime
from rfeed import *  # library for generating rss.xml
import re
import os
from glob import glob

URL = "https://raw.githubusercontent.com/hehonghui/the-economist-ebooks/master/README.md"
BASE_URL = "http://your_vps_host/download/"

SVN_BASE_URL = "https://github.com/hehonghui/the-economist-ebooks/trunk/"

# The README lines we parse look like this:
# * [经济学人 - 周刊, 点击这里下载最新一期](01_economist/te_2021.02.27) , 每周五十一点更新
# * [纽约客 - 周刊, 点击这里下载最新一期](02_new_yorker/2021.03.01) , 每周六上午更新
# * [卫报 - 每周两期](09_guardian/), 每周三、周日更新

s = request.urlopen(URL).read().decode('utf8')

# Keep the first two matches: The Economist and The New Yorker
res = re.findall(r"\[.+\]\((0.+)\)", s)[0:2]


# Generate the RSS xml from the issues already on disk.
# Note: the feed reflects what is on disk now, so a freshly
# published issue shows up on the next hourly run.
res_list = []
for item in res:
    res_list += sorted(glob(item.split('/')[0] + "/*"))

item_list = []
for item in res_list:
    _one = Item(
        title = item[3:],  # strip the "NN_" directory prefix
        link = BASE_URL + item,
        description = 'pdf etc. <a href="%s">link</a> ' % (BASE_URL + item),
        author = "hehonghui",
        guid = Guid(BASE_URL + item),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # year, month, day, hour, minute
    item_list += [_one]

feed = Feed(
        title = "The Economist + The New Yorker updates",
        link = "https://www.xxxxx.biz/atom/updated.xml",
        description = "Updates for The Economist and The New Yorker",
        language = "en-US",
        lastBuildDate = datetime.datetime.now(),
        items = item_list)


def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('test.xml', feed.rss())


# Start downloading anything in the README that is not on disk yet
for item in res:
    if item not in res_list:
        print("downloading %s" % item)
        os.popen("svn checkout %s %s" % (SVN_BASE_URL + item, item))

    # Keep at most 5 issues per publication so the server doesn't fill up
    _now = sorted(glob(item.split('/')[0] + "/*"))
    print(_now)
    if len(_now) >= 5:
        for _d in _now[:-5]:
            os.popen("rm -rf %s" % _d)
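For reference, here is what that findall pulls out of a single README line (the sample line is kept verbatim, Chinese included, since the regex runs against the raw file):

import re

line = "* [经济学人 - 周刊, 点击这里下载最新一期](01_economist/te_2021.02.27) , 每周五十一点更新"
print(re.findall(r"\[.+\]\((0.+)\)", line))
# -> ['01_economist/te_2021.02.27']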
    

On Linux, cron can run the download task on a schedule: drop a small wrapper into /etc/cron.hourly/ (adjust the path inside it) and it will run once an hour.

#!/bin/sh
# cron runs this from /, so cd into the directory holding the script
# and its data first (placeholder path, adjust to your setup).
cd /path/to/rss_scripts || exit 1
date >> test.log
python3 down_ecomic.py >> test.log 2>&1
exit 0
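Two details that are easy to trip over with /etc/cron.hourly: the wrapper must be executable (chmod +x), and on Debian-style systems run-parts skips files whose names contain a dot, so name the wrapper down_ecomic rather than down_ecomic.sh.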

Downloading YouTube audio and auto-converting it to mp3

YouTube publishes RSS feeds natively, so it is enough to parse one with feedparser and hand the video links to youtube_dl. Install the dependencies yourself (pip install feedparser youtube_dl rfeed, plus ffmpeg for the mp3 conversion).
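Before wiring everything together, it helps to poke at the feed in a REPL and see which fields feedparser exposes; a minimal sketch using the same channel as the script below:

import feedparser

rss = feedparser.parse("https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA")
for entry in rss['entries'][:3]:
    # feedparser maps the namespaced yt:videoId element to the key 'yt_videoid'
    print(entry['title'], entry['published'], entry['link'], entry['yt_videoid'])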

# -*- coding: utf-8 -*-
#
# 2020-04-18
# Run hourly from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint
URL="https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
rss = feedparser.parse(URL)
entries=rss['entries'][:3]

BASE_URL="http://xxxxx/down_youtube/"

item_list = []
for entry in entries:
    print(entry['title'])
    print(entry['published'])
    print(entry['link'])
    # print(entry['summary'])
    new_link = BASE_URL + "book_audios/%s.mp3" % entry['yt_videoid']
    _one = Item(
        title = entry['title'],
        link = new_link,
        description = 'audio: <a href="%s">mp3</a> %s' % (new_link, entry['summary']),
        author = "Youtube",
        guid = Guid(new_link),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # year, month, day, hour, minute
    item_list += [_one]




feed = Feed(
        title = "Zhihu Reading Club updates",
        link = "https://www.xxxxx.biz/atom/updated.xml",
        description = "Zhihu Reading Club updates from YouTube",
        language = "en-US",
        lastBuildDate = datetime.datetime.now(),
        items = item_list)

def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('audio.xml', feed.rss())




import youtube_dl

# start downing 
if not os.path.exists("book_audios/"):
    os.makedirs("book_audios/")


def audio_download(youtube_url):
    # youtube_dl options: take the best audio stream and convert it to mp3 via ffmpeg
    ydl_opts = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': 'book_audios/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])


# audio_download('https://www.youtube.com/watch?v=JElpSrsmbTU')



for entry in entries:
    if entry['yt_videoid'] + '.mp3' not in os.listdir('book_audios'):
        print("downloading %s, %s" % (entry['title'], entry['link']))
        audio_download(entry['link'])

    # Keep only the 5 newest files; delete the rest
    _now = sorted(glob("book_audios/*"), key=os.path.getctime)
    print(_now)
    if len(_now) >= 6:
        for _d in _now[:-5]:
            os.popen("rm -rf %s" % _d)
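One dependency that is easy to miss: the FFmpegExtractAudio post-processor shells out to ffmpeg, so without it the download succeeds but no mp3 ever appears. A quick fail-fast check you could drop near the top of the script:

# Fail fast if ffmpeg is missing; FFmpegExtractAudio needs it for mp3 conversion
import shutil

if shutil.which("ffmpeg") is None:
    raise RuntimeError("ffmpeg not found on PATH; install it to get mp3 output")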
    

Downloading YouTube videos

The video version wraps the same fetch-parse-download logic into a small class, so several channels can each be handled with one feed.

# -*- coding: utf-8 -*-
#
# 2020-04-18
# Run hourly from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
# pip install feedparser youtube_dl rfeed
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint
from os import rename
import youtube_dl



def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

def audio_download(youtube_url, file_root, only_audio=False):
    # Despite the name, this downloads the full video unless only_audio is set.
    # Video options: best combined stream, saved as <video id>.<ext>
    ydl_opts = {
        'format': 'best',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
    }
    # Audio options: best audio stream, converted to mp3 via ffmpeg
    ydl_opts_a = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    if only_audio:
        ydl_opts = ydl_opts_a
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])



class YoutubeDowner:
    def __init__(self, yurl, burl, file_root, feed_title, only_audio=False):
        self.yurl = yurl
        self.burl = burl
        self.file_root = file_root
        self.feed_title = feed_title
        self.feed_des = feed_title
        self.only_audio = only_audio
        self.type_name = "mp4"
        if only_audio:
            self.type_name = "mp3"

        rss = feedparser.parse(yurl)
        self.entries = rss['entries'][:3]
        if not os.path.exists(file_root):
            os.makedirs(file_root)
        self.generate_xml()

        self.down()

    def generate_xml(self):
        item_list = []
        for entry in self.entries:
            print(entry['title'], entry['published'], entry['link'])
            # print(entry['summary'])
            # 'published' looks like 2021-07-23T10:00:00+00:00; keep only the date part
            date_s = entry['published'].split('T')[0].split('-')
            dates = [int(x) for x in date_s]
            new_link = self.burl + "/%s.%s" % (entry['yt_videoid'], self.type_name)  # the actual download URL
            _one = Item(
                title = entry['title'],
                link = new_link,
                description = '%s: <a href="%s">%s</a> <pre>%s</pre>' % (self.type_name, new_link, self.type_name, entry['summary']),
                author = "Youtube",
                guid = Guid(new_link),
                pubDate = datetime.datetime(dates[0], dates[1], dates[2], 6, 0))  # year, month, day, hour, minute
            item_list += [_one]


        feed = Feed(
                title = self.feed_title,
                link = "https://www.xxxxx.biz/atom/updated.xml",
                description = self.feed_des,
                language = "en-US",
                lastBuildDate = datetime.datetime.now(),
                items = item_list)

        save_to_file('%s.xml' % self.file_root, feed.rss())
    
    def down(self):
        for entry in self.entries:
            if entry['yt_videoid'] + '.' + self.type_name not in os.listdir(self.file_root):
                print("downloading %s, %s" % (entry['title'], entry['link']))
                audio_download(entry['link'], self.file_root, self.only_audio)

        # Keep only the 5 newest files; delete the rest
        _now = sorted(glob(self.file_root + "/*"), key=os.path.getctime)
        print(_now)
        if len(_now) >= 6:
            for _d in _now[:-5]:
                os.popen("rm -rf %s" % _d)



    


# Example: a video feed for 李永乐老师 (Teacher Li Yongle)'s channel
# url_rss = "https://www.youtube.com/feeds/videos.xml?channel_id=UCSs4A6HYKmHA2MG_0z-F0xw"
# url_base = "http://xxxxx/"
# root = "rss2"
# title = "李永乐老师"

# YoutubeDowner(url_rss, url_base, root, title)
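The audio feed from the previous section can also be reproduced through this class; a usage sketch (the host and directory are placeholders, the channel id is the one from the audio script):

url_rss = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
url_base = "http://your_vps_host/down_youtube/book_audios"  # placeholder host
YoutubeDowner(url_rss, url_base, "book_audios", "Zhihu Reading Club updates", only_audio=True)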

The end result is the same kind of feed as before, now serving the downloaded videos.

Enjoy!