Tracking GitHub file updates and YouTube updates with RSS

Preface

It has been a while since I last updated this blog. RSS as a way of keeping up with information seems to be coming back into fashion.

GitHub file updates

GitHub hosts plenty of good resources. My workflow: the README of a GitHub repository gets updated -> the corresponding files are fetched with svn -> an RSS file is generated.

Why so roundabout? Because subscribing to the repository itself floods the feed with updates I do not need, and downloading files directly from GitHub is rather slow.
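The svn step works because GitHub also exposes repositories over the Subversion protocol, with /trunk mapped to the default branch, so a single subdirectory can be checked out without cloning the whole repo. A minimal sketch of that step, the way the script below shells out to it (the directory name here is just one example issue from the README):

# Sketch of the svn trick used below; the real script derives the path from the README links.
import os

SVN_BASE_URL = "https://github.com/hehonghui/the-economist-ebooks/trunk/"
item = "01_economist/te_2021.02.27"  # example subdirectory, one issue of The Economist
# equivalent to: svn checkout <SVN_BASE_URL>/01_economist/te_2021.02.27 01_economist/te_2021.02.27
os.popen("svn checkout %s %s" % (SVN_BASE_URL + item, item))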

The end result: a feed of the latest issues of The Economist and The New Yorker.

(Screenshot: the resulting feed showing The Economist and The New Yorker updates in an RSS reader.)

The download script

It parses the paths out of the README, downloads them with svn, and generates the RSS feed.

# -*- coding: utf-8 -*-
# Intended to be run hourly, e.g. from /etc/cron.hourly
from urllib import request
import datetime
from rfeed import *  # rfeed (pip install rfeed): the library used to generate the RSS xml
import re
import os
from glob import glob

URL = "https://raw.githubusercontent.com/hehonghui/the-economist-ebooks/master/README.md"
BASE_URL = "http://your_vps_host/download/"

SVN_BASE_URL = "https://github.com/hehonghui/the-economist-ebooks/trunk/"

# Sample of the README lines being matched (s is overwritten with the real README below):
s = """
* [经济学人 - 周刊, 点击这里下载最新一期](01_economist/te_2021.02.27) , 每周五十一点更新
* [纽约客 - 周刊, 点击这里下载最新一期](02_new_yorker/2021.03.01) , 每周六上午更新
* [卫报 - 每周两期](09_guardian/), 每周三、周日更新

"""
s = request.urlopen(URL).read().decode('utf8')

# Grab the first two markdown links whose target starts with "0" (The Economist and The New Yorker)
res = re.findall(r"\[.+\]\((0.+)\)", s)[0:2]


# generate the RSS xml from what is already on disk
res_list = []
for item in res:
    res_list += sorted(glob(item.split('/')[0] + "/*"))

item_list = []
for item in res_list:
    _one = Item(
        title = item[3:],  # strip the "01_" / "02_" prefix
        link = BASE_URL + item,
        description = 'pdf etc. <a href="%s">link</a> ' % (BASE_URL + item),
        author = "hehonghui",
        guid = Guid(BASE_URL + item),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # placeholder date: year, month, day, hh, mm
    item_list += [_one]

feed = Feed(
    title = "经济学人+纽约客更新",
    link = "https://www.xxxxx.biz/atom/updated.xml",
    description = "更新经济学人,纽约客",
    language = "en-US",
    lastBuildDate = datetime.datetime.now(),
    items = item_list)


def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('test.xml', feed.rss())


# start downloading anything listed in the README that is not on disk yet
for item in res:
    if item not in res_list:
        print("downloading %s" % item)
        os.popen("svn checkout %s %s" % (SVN_BASE_URL + item, item))

    _now = sorted(glob(item.split('/')[0] + "/*"))
    print(_now)
    if len(_now) >= 5:  # keep at most 5 issues so the server does not fill up
        for _d in _now[:-5]:
            os.popen("rm -rf %s" % _d)
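The generated test.xml and the checked-out issue directories then need to be reachable at BASE_URL. Any static file server will do; as a throwaway sketch, Python's built-in server works (the port is an assumption, and in practice a proper web server on the VPS is a better fit):

# Quick-and-dirty static server for the download directory.
# Assumption: run from the same working directory as the script above,
# so test.xml and 01_economist/... become reachable over HTTP; adjust BASE_URL accordingly.
import http.server
import socketserver

PORT = 8000  # hypothetical port
with socketserver.TCPServer(("", PORT), http.server.SimpleHTTPRequestHandler) as httpd:
    httpd.serve_forever()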

On Linux, cron can start the download on a schedule; dropping an executable wrapper script into /etc/cron.hourly/ is enough. The cd path in the wrapper below is a placeholder for wherever down_ecomic.py and its download directories actually live.

#!/bin/sh
# placeholder path: change to the directory that holds down_ecomic.py and the downloads
cd /path/to/your/download/dir || exit 1
date >> test.log
python3 down_ecomic.py >> test.log
exit 0

Downloading YouTube audio and converting it to mp3 automatically

YouTube already provides RSS feeds for channels, so all it takes is feedparser to read the feed and youtube_dl to download the entries. Install the various packages yourself (feedparser, youtube_dl, rfeed, plus ffmpeg for the mp3 conversion).
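Before wiring everything together, it is worth confirming the channel feed parses at all. A minimal check, using the same channel_id as the script below:

# Sanity check of a channel feed; the fields printed here are the ones the script relies on.
import feedparser

URL = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
rss = feedparser.parse(URL)
for entry in rss['entries'][:3]:
    # each entry carries a title, a published date, the video link and the yt_videoid used as filename
    print(entry['yt_videoid'], entry['published'], entry['title'])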

# -*- coding: utf-8 -*-
#
# 2020-04-18
# Intended to be run hourly, e.g. from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint

URL = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
rss = feedparser.parse(URL)
entries = rss['entries'][:3]  # only keep the three newest videos

BASE_URL = "http://xxxxx/down_youtube/"

# generate the RSS xml
item_list = []
for entry in entries:
    print(entry['title'])
    print(entry['published'])
    print(entry['link'])
    # print(entry['summary'])
    new_link = BASE_URL + "book_audios/%s.mp3" % entry['yt_videoid']
    _one = Item(
        title = entry['title'],
        link = new_link,
        description = 'audio: <a href="%s">mp3</a> %s' % (new_link, entry['summary']),
        author = "Youtube",
        guid = Guid(new_link),
        pubDate = datetime.datetime(2020, 1, 1, 4, 0))  # placeholder date: year, month, day, hh, mm
    item_list += [_one]


feed = Feed(
    title = "知乎读书会更新",
    link = "https://www.xxxxx.biz/atom/updated.xml",
    description = "更新Youtube知乎读书会",
    language = "en-US",
    lastBuildDate = datetime.datetime.now(),
    items = item_list)


def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()

save_to_file('audio.xml', feed.rss())


from os import rename
import youtube_dl

# start downloading
if not os.path.exists("book_audios/"):
    os.makedirs("book_audios/")


def audio_download(youtube_url):
    # youtube_dl options: grab the best audio stream and convert it to 192k mp3 with ffmpeg
    ydl_opts = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': 'book_audios/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])


# audio_download('https://www.youtube.com/watch?v=JElpSrsmbTU')


# download anything that is not on disk yet
for entry in entries:
    if entry['yt_videoid'] + '.mp3' not in os.listdir('book_audios'):
        print("downloading %s, %s" % (entry['title'], entry['link']))
        audio_download(entry['link'])

# delete older files, keep only the newest 5
_now = sorted(glob("book_audios/*"), key=os.path.getctime)
print(_now)
if len(_now) >= 6:
    for _d in _now[:-5]:
        os.popen("rm -rf %s" % _d)

Downloading YouTube videos

The same idea again, this time wrapped in a small class that can handle either full videos or audio-only channels.

# -*- coding: utf-8 -*-
#
# 2020-04-18
# Intended to be run hourly, e.g. from /etc/cron.hourly  # https://www.runoob.com/w3cnote/linux-crontab-tasks.html
# pip install feedparser youtube_dl
from urllib import request
import datetime
from rfeed import *
import re
import os
from glob import glob
import feedparser
import pprint
from os import rename
import youtube_dl


def save_to_file(file_name, contents):
    fh = open(file_name, 'w')
    fh.write(contents)
    fh.close()


def audio_download(youtube_url, file_root, only_audio=False):
    # default options: best single-file video stream
    ydl_opts = {
        'format': 'best',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
    }
    # audio-only options: best audio stream, converted to 192k mp3 with ffmpeg
    ydl_opts_a = {
        'format': 'bestaudio/best',
        # 'download_archive': 'downloaded_songs.txt',
        'outtmpl': file_root + '/%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    if only_audio:
        ydl_opts = ydl_opts_a
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])


class YoutubeDowner:
    def __init__(self, yurl, burl, file_root, feed_title, only_audio=False):
        self.yurl = yurl            # channel RSS url
        self.burl = burl            # public base url the downloaded files are served from
        self.file_root = file_root  # local directory to download into
        self.feed_title = feed_title
        self.feed_des = feed_title
        self.only_audio = only_audio
        self.type_name = "mp4"
        if only_audio:
            self.type_name = "mp3"

        rss = feedparser.parse(yurl)
        self.entries = rss['entries'][:3]  # only keep the three newest videos
        if not os.path.exists(file_root):
            os.makedirs(file_root)
        self.generate_xml()

        self.down()

    def generate_xml(self):
        item_list = []
        for entry in self.entries:
            print(entry['title'], entry['published'], entry['link'])
            # print(entry['summary'])
            date_s = entry['published'].split('T')[0].split('-')
            dates = [int(x) for x in date_s]
            # print(dates)
            new_link = self.burl + "/%s.%s" % (entry['yt_videoid'], self.type_name)  # the actual download url
            print('%s: <a href="%s">%s</a> <pre>%s</pre>' % (self.type_name, new_link, self.type_name, entry['summary']))
            _one = Item(
                title = entry['title'],
                link = new_link,
                description = '%s: <a href="%s">%s</a> <pre>%s</pre>' % (self.type_name, new_link, self.type_name, entry['summary']),
                author = "Youtube",
                guid = Guid(new_link),
                pubDate = datetime.datetime(dates[0], dates[1], dates[2], 6, 0))  # year, month, day, hh, mm
            item_list += [_one]

        feed = Feed(
            title = self.feed_title,
            link = "https://www.xxxxx.biz/atom/updated.xml",
            description = self.feed_des,
            language = "en-US",
            lastBuildDate = datetime.datetime.now(),
            items = item_list)

        save_to_file('%s.xml' % self.file_root, feed.rss())

    def down(self):
        for entry in self.entries:
            if entry['yt_videoid'] + '.' + self.type_name not in os.listdir(self.file_root):
                print("downloading %s, %s" % (entry['title'], entry['link']))
                audio_download(entry['link'], self.file_root, self.only_audio)

        # delete older files, keep only the newest 5
        _now = sorted(glob(self.file_root + "/*"), key=os.path.getctime)
        print(_now)
        if len(_now) >= 6:
            for _d in _now[:-5]:
                os.popen("rm -rf %s" % _d)


# url_rss = "https://www.youtube.com/feeds/videos.xml?channel_id=UCSs4A6HYKmHA2MG_0z-F0xw"
# url_base = "http://xxxxx/"
# root = "rss2"
# title = "李永乐老师"

# YoutubeDowner(url_rss, url_base, root, title)
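For an audio-only channel, such as the reading-club feed from the previous section, the same class can be driven like this (the host and directory name below are placeholders to swap for your own):

# Hypothetical example values; only_audio=True switches youtube_dl to bestaudio + mp3 conversion.
url_rss = "https://www.youtube.com/feeds/videos.xml?channel_id=UCFhp6N5z8W9Ann2eyHAzbbA"
url_base = "http://your_vps_host/down_youtube/book_audios"  # where the mp3 files will be served
root = "book_audios"                                        # local download directory; feed is saved as book_audios.xml
title = "知乎读书会更新"

YoutubeDowner(url_rss, url_base, root, title, only_audio=True)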

That is the end result. Enjoy!
