Preface
It has been a while since I last updated this blog. RSS as a way of consuming information seems to be coming back into fashion.
GitHub file updates
There are plenty of good resources on GitHub. My workflow is: the GitHub README is updated -> download the corresponding files via svn -> generate an RSS file.
Why go to all this trouble? Because the repository's own update feed is cluttered with entries I don't need, and downloading files from GitHub directly is fairly slow.
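The svn step works because GitHub exposes repositories over the Subversion protocol, where the trunk/ path segment maps to the default branch, so a single subdirectory can be checked out without cloning the whole repository. A minimal sketch, using the repo path from the script below and an issue directory taken from its sample README lines:

import subprocess

# check out just one issue directory through GitHub's SVN bridge;
# "trunk/" stands in for the repository's default branch
subprocess.run([
    "svn", "checkout",
    "https://github.com/hehonghui/the-economist-ebooks/trunk/01_economist/te_2021.02.27",
    "01_economist/te_2021.02.27",
], check=True)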
The end result: updates for The Economist and The New Yorker~

Download script
Parse the file paths from the README, download them automatically, and generate an RSS feed.
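Concretely, the regex in the script pulls the relative paths out of the README's markdown links. A quick sanity check against one of the sample lines quoted in the script:

import re

sample = "* [经济学人 - 周刊, 点击这里下载最新一期](01_economist/te_2021.02.27) , 每周五十一点更新"
# the capture group keeps only link targets that start with "0", i.e. the issue directories
print(re.findall(r"\[.+\]\((0.+)\)", sample))  # ['01_economist/te_2021.02.27']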
# -*- coding: utf-8 -*-
# scheduled from /etc/cron.hourly
from urllib import request
import datetime
from rfeed import *  # library used to generate the rss xml
import re
import os
from glob import glob

URL = "https://raw.githubusercontent.com/hehonghui/the-economist-ebooks/master/README.md"
BASE_URL = "http://your_vps_host/download/"
SVN_BASE_URL = "https://github.com/hehonghui/the-economist-ebooks/trunk/"
# sample of the README lines being parsed:
s = """
* [经济学人 - 周刊, 点击这里下载最新一期](01_economist/te_2021.02.27) , 每周五十一点更新
* [纽约客 - 周刊, 点击这里下载最新一期](02_new_yorker/2021.03.01) , 每周六上午更新
* [卫报 - 每周两期](09_guardian/), 每周三、周日更新
"""
s = request.urlopen(URL).read().decode('utf8')
# keep the first two entries (The Economist and The New Yorker)
res = re.findall(r"\[.+\]\((0.+)\)", s)[0:2]

# generate the rss xml from the files already on disk
res_list = []
for item in res:
    res_list += sorted(glob(item.split('/')[0] + "/*"))
item_list = []
print(res_list)  # debug: files currently on disk
for item in res_list:
    _one = Item(
        title = item[3:],  # drop the numeric "01_" prefix
        link = BASE_URL + item,
        description = 'pdf etc. <a href="%s">link</a> ' % (BASE_URL + item),
        author = "hehonghui",
        guid = Guid(BASE_URL + item),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # year, month, day, hh, mm
    item_list += [_one]
feed = Feed(
    title = "经济学人+纽约客更新",
    link = "https://www.xxxxx.biz/atom/updated.xml",
    description = "更新经济学人,纽约客",
    language = "en-US",
    lastBuildDate = datetime.datetime.now(),
    items = item_list)

def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('test.xml', feed.rss())

# start downloading anything that is not on disk yet
for item in res:
    if item not in res_list:
        print("downloading %s" % item)
        os.popen("svn checkout %s %s" % (SVN_BASE_URL + item, item))
    _now = sorted(glob(item.split('/')[0] + "/*"))
    print(_now)
    if len(_now) >= 5:  # keep at most 5 issues so the server does not fill up
        for _d in _now[:-5]:
            os.popen("rm -rf %s" % _d)
On Linux, crontab can be used to schedule the download task; putting a small script under /etc/cron.hourly/ is all it takes.
#!/bin/sh
date >> test.log
python3 down_ecomic.py >> test.log
exit 0
Downloading YouTube audio and converting it to mp3
YouTube offers RSS feeds out of the box, so all it takes is parsing one with feedparser and then downloading with youtube_dl. Install the required packages yourself (plus ffmpeg for the mp3 conversion).
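One detail worth knowing before reading the script: feedparser flattens YouTube's namespaced <yt:videoId> element into the key yt_videoid. A quick interactive check, using the same channel feed as the script:

import feedparser

feed_url = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
rss = feedparser.parse(feed_url)
for entry in rss['entries'][:3]:
    # the namespaced <yt:videoId> shows up as entry['yt_videoid']
    print(entry['title'], entry['published'], entry['yt_videoid'])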
# -*- coding: utf-8 -*-
#
# 2020-04-18
# scheduled from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint

URL = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
rss = feedparser.parse(URL)
entries = rss['entries'][:3]
BASE_URL = "http://xxxxx/down_youtube/"
item_list = []
for entry in entries:
    print(entry['title'])
    print(entry['published'])
    print(entry['link'])
    # print(entry['summary'])
    new_link = BASE_URL + "book_audios/%s.mp3" % entry['yt_videoid']
    _one = Item(
        title = entry['title'],
        link = new_link,
        description = 'audio: <a href="%s">mp3</a> %s' % (new_link, entry['summary']),
        author = "Youtube",
        guid = Guid(new_link),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # year, month, day, hh, mm
    item_list += [_one]
feed = Feed(
    title = "知乎读书会更新",
    link = "https://www.xxxxx.biz/atom/updated.xml",
    description = "更新Youtube知乎读书会",
    language = "en-US",
    lastBuildDate = datetime.datetime.now(),
    items = item_list)

def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('audio.xml', feed.rss())

from os import rename
import youtube_dl

# start downloading
if not os.path.exists("book_audios/"):
    os.makedirs("book_audios/")

def audio_download(youtube_url):
    # youtube_dl options: grab the best audio track and convert it to mp3 via ffmpeg
    ydl_opts = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': 'book_audios/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

# audio_download('https://www.youtube.com/watch?v=JElpSrsmbTU')
for entry in entries:
    if entry['yt_videoid'] + '.mp3' not in os.listdir('book_audios'):
        print("downloading %s, %s" % (entry['title'], entry['link']))
        audio_download(entry['link'])
_now = sorted(glob("book_audios/*"), key=os.path.getctime)
print(_now)
if len(_now) >= 6:  # delete the extras, keeping the latest five
    for _d in _now[:-5]:
        os.popen("rm -rf %s" % _d)
Downloading YouTube videos
# -*- coding: utf-8 -*-
#
# 2020-04-18
# scheduled from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
# pip install feedparser youtube_dl
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint
from os import rename
import youtube_dl

def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

def audio_download(youtube_url, file_root, only_audio=False):
    # default options: best available video
    ydl_opts = {
        'format': 'best',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
    }
    # audio-only options: extract the audio track and convert it to mp3
    ydl_opts_a = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    if only_audio:
        ydl_opts = ydl_opts_a
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

class YoutubeDowner:
    def __init__(self, yurl, burl, file_root, feed_title, only_audio=False):
        self.yurl = yurl
        self.burl = burl
        self.file_root = file_root
        self.feed_title = feed_title
        self.feed_des = feed_title
        self.only_audio = only_audio
        self.type_name = "mp4"
        if only_audio:
            self.type_name = "mp3"
        rss = feedparser.parse(yurl)
        self.entries = rss['entries'][:3]
        if not os.path.exists(file_root):
            os.makedirs(file_root)
        self.generate_xml()
        self.down()

    def generate_xml(self):
        item_list = []
        for entry in self.entries:
            print(entry['title'], entry['published'], entry['link'])
            # print(entry['summary'])
            date_s = entry['published'].split('T')[0].split('-')
            dates = [int(x) for x in date_s]
            # print(dates)
            new_link = self.burl + "/%s.%s" % (entry['yt_videoid'], self.type_name)  # the actual download URL
            print('%s: <a href="%s">%s</a> <pre>%s</pre>' % (self.type_name, new_link, self.type_name, entry['summary']))
            _one = Item(
                title = entry['title'],
                link = new_link,
                description = '%s: <a href="%s">%s</a> <pre>%s</pre>' % (self.type_name, new_link, self.type_name, entry['summary']),
                author = "Youtube",
                guid = Guid(new_link),
                pubDate = datetime.datetime(dates[0], dates[1], dates[2], 6, 0))  # year, month, day, hh, mm
            item_list += [_one]
        feed = Feed(
            title = self.feed_title,
            link = "https://www.xxxxx.biz/atom/updated.xml",
            description = self.feed_des,
            language = "en-US",
            lastBuildDate = datetime.datetime.now(),
            items = item_list)
        save_to_file('%s.xml' % self.file_root, feed.rss())

    def down(self):
        for entry in self.entries:
            if entry['yt_videoid'] + '.' + self.type_name not in os.listdir(self.file_root):
                print("downloading %s, %s" % (entry['title'], entry['link']))
                audio_download(entry['link'], self.file_root, self.only_audio)
        _now = sorted(glob(self.file_root + "/*"), key=os.path.getctime)
        print(_now)
        if len(_now) >= 6:  # delete the extras, keeping the latest five
            for _d in _now[:-5]:
                os.popen("rm -rf %s" % _d)

# url_rss = "https://www.youtube.com/feeds/videos.xml?channel_id=UCSs4A6HYKmHA2MG_0z-F0xw"
# url_base = "http://xxxxx/"
# root = "rss2"
# title = "李永乐老师"
# YoutubeDowner(url_rss, url_base, root, title)
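The commented-out lines above are a video example; since the class takes an only_audio flag, the same placeholder variables also cover an audio feed, in which case files are saved as mp3 and the RSS items point at them:

# same placeholder variables as above, but audio-only
# YoutubeDowner(url_rss, url_base, root, title, only_audio=True)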
The final result looks like this:

Hope you enjoy using it!