当前位置:首页 » 《我的小黑屋》 » 正文

python爬取B站视频

11 人参与  2024年04月07日 13:00  分类 : 《我的小黑屋》  评论

点击全文阅读


参考:https://cloud.tencent.com/developer/article/1768680

参考文章中的代码有些问题:请求头(referer 和 User-Agent)需要修改。修改后的完整代码如下:

"""Download a Bilibili video: fetch the page, extract the DASH audio/video
stream URLs from the embedded ``window.__playinfo__`` JSON, save both streams
to disk and mux them into a single file with moviepy."""
import json
import pprint
import re  # regular expressions

import requests
from bs4 import BeautifulSoup as bs
from moviepy.editor import AudioFileClip, VideoFileClip

headers = {
    # Anti-hotlinking: tells the server which page the request was referred from.
    'referer': 'https://www.bilibili.com/a',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}


def send_request(url, timeout=30):
    """Send a GET request with the module-level ``headers``.

    :param url: address to request
    :param timeout: seconds to wait for the server before giving up
    :return: the ``requests.Response`` object
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status,
        so an error page is never mistaken for page or media content
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response


def get_video_data(html_data):
    """Parse the video page.

    :param html_data: HTML text of the video page
    :return: list ``[title, audio_url, video_url]``
    :raises ValueError: if the title tag or the ``__playinfo__`` script
        cannot be found in the page
    """
    # Extract the video title.
    soup = bs(html_data, 'lxml')
    title_tags = soup.find_all(name='h1', attrs={"class": "video-title special-text-indent"})
    if not title_tags:
        raise ValueError('video title <h1> not found in page')
    title = title_tags[0].get_text()

    # Extract the JSON blob describing the streams. Raw string: the pattern
    # contains a regex escape (\.) that is not a valid Python string escape.
    playinfo = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', html_data)
    if not playinfo:
        raise ValueError('window.__playinfo__ script not found in page')
    json_data = json.loads(playinfo[0])
    pprint.pprint(json_data)

    # First audio stream, first backup URL.
    audio_url = json_data['data']['dash']['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)
    # First video stream, first backup URL.
    video_url = json_data['data']['dash']['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)

    video_data = [title, audio_url, video_url]
    return video_data


def save_data(file_name, audio_url, video_url):
    """Download both streams and write them to ``file_name + '.mp3'`` and
    ``file_name + '.mp4'``.

    :param file_name: output path without extension
    :param audio_url: URL of the audio stream
    :param video_url: URL of the video (picture only) stream
    """
    print('正在请求音频数据')
    audio_data = send_request(audio_url).content
    print('正在请求视频数据')
    video_data = send_request(video_url).content
    with open(file_name + '.mp3', mode='wb') as f:
        f.write(audio_data)
        print('正在保存音频数据')
    with open(file_name + '.mp4', mode='wb') as f:
        f.write(video_data)
        print('正在保存视频数据')


def merge_data(video_name, output_name="output.mp4"):
    """Mux ``video_name + '.mp3'`` and ``video_name + '.mp4'`` into one file.

    The longer of the two streams is trimmed to the length of the shorter one
    before the audio is attached.

    :param video_name: path (without extension) used by ``save_data``
    :param output_name: path of the merged file to write
    """
    print('视频合成开始:', video_name)
    # Context managers close the underlying ffmpeg readers even if muxing fails.
    with AudioFileClip(video_name + '.mp3') as audioclip, \
            VideoFileClip(video_name + '.mp4') as videoclip:
        video_time = videoclip.duration
        audio_time = audioclip.duration
        if video_time > audio_time:
            # Video is longer than audio: trim the video.
            videoclip_new = videoclip.subclip(0, audio_time)
            audioclip_new = audioclip
        else:
            # Audio is at least as long as the video: trim the audio.
            videoclip_new = videoclip
            audioclip_new = audioclip.subclip(0, video_time)
        # Attach the audio and write the result to a new file.
        video_with_new_audio = videoclip_new.set_audio(audioclip_new)
        video_with_new_audio.write_videofile(output_name,
                                             codec='libx264',
                                             audio_codec='aac',
                                             temp_audiofile='temp-video.m4a',
                                             remove_temp=True
                                             )
    print('视频合成结束:', video_name)


def _safe_file_name(title):
    """Turn a page title into a string usable as a file name on common
    file systems (path separators and reserved characters become ``_``)."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .')
    return cleaned or 'video'


def main():
    """Download and merge the example video."""
    url = 'https://www.bilibili.com/video/BV1bK421a7qG/?spm_id_from=333.1007.tianma.6-4-22.click'
    response = send_request(url)
    # Use the charset declared inside the page when there is one; otherwise
    # keep whatever encoding requests detected.
    encodings = requests.utils.get_encodings_from_content(response.text)
    if encodings:
        response.encoding = encodings[0]
    html_data = response.text
    title, audio_url, video_url = get_video_data(html_data)
    file_name = _safe_file_name(title)
    save_data(file_name, audio_url, video_url)
    merge_data(file_name)


if __name__ == '__main__':
    main()

效果

小姐姐挺靓,就是左下角有水印,想办法去除水印,参考:python实战之去除视频水印&字幕_python 去除视频水印-CSDN博客

"""Remove a static watermark or burnt-in subtitles from every video in a
directory by inpainting a thresholded mask, then re-attach the original audio."""
import os
import sys

import cv2
import numpy
from moviepy import editor

# Silent intermediate video written by OpenCV before the audio is re-attached.
TEMP_VIDEO = 'temp.mp4'


class WatermarkRemover():
    """Inpaint watermarks/subtitles out of all videos found in a directory."""

    def __init__(self, video_path, output, threshold: int, kernel_size: int):
        """
        :param video_path: directory containing the input videos
        :param output: directory the processed videos are written to
        :param threshold: grayscale threshold used for binarisation
        :param kernel_size: side length of the square dilation kernel
        """
        self.threshold = threshold
        self.kernel_size = kernel_size
        self.video_path = video_path
        self.output = output

    def select_roi(self, img: numpy.ndarray, hint: str) -> list:
        """Let the user draw a box around the watermark or subtitle.

        The image is shown at 70% size; SPACE or ENTER confirms the selection.

        :param img: image to display
        :param hint: window title
        :return: ``[x, y, width, height]`` scaled back to ``img`` coordinates
        """
        COFF = 0.7
        w, h = int(COFF * img.shape[1]), int(COFF * img.shape[0])
        resize_img = cv2.resize(img, (w, h))
        roi = cv2.selectROI(hint, resize_img, False, False)
        cv2.destroyAllWindows()
        watermark_roi = [int(value / COFF) for value in roi[:4]]
        return watermark_roi

    def dilate_mask(self, mask: numpy.ndarray) -> numpy.ndarray:
        """Dilate the mask with a ``kernel_size`` x ``kernel_size`` kernel of
        ones, which enlarges the masked area.

        :param mask: mask image
        :return: dilated mask
        """
        kernel = numpy.ones((self.kernel_size, self.kernel_size), numpy.uint8)
        return cv2.dilate(mask, kernel)

    def generate_single_mask(self, img: numpy.ndarray, roi: list, threshold: int) -> numpy.ndarray:
        """Build the mask for one frame from the selected region.

        Pixels inside the ROI whose gray value exceeds ``threshold`` become
        255; everything else is 0.

        :param img: single BGR frame
        :param roi: ``[x, y, width, height]`` of the selected region
        :param threshold: binarisation threshold
        :return: mask with the same height/width as ``img``
        """
        # Invalid region: abort the program.
        if len(roi) != 4:
            print('NULL ROI!')
            sys.exit()

        # Copy only the ROI pixels of the grayscale frame into a black canvas.
        roi_img = numpy.zeros((img.shape[0], img.shape[1]), numpy.uint8)
        row_start, row_end = int(roi[1]), int(roi[1] + roi[3])
        col_start, col_end = int(roi[0]), int(roi[0] + roi[2])
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        roi_img[row_start:row_end, col_start:col_end] = gray[row_start:row_end, col_start:col_end]

        _, mask = cv2.threshold(roi_img, threshold, 255, cv2.THRESH_BINARY)
        return mask

    def generate_watermark_mask(self, video_path: str) -> numpy.ndarray:
        """Build the watermark mask from several frames of one video.

        The user selects the ROI on the first frame; single-frame masks of
        sampled frames are combined with a bitwise AND, then dilated.

        :param video_path: path of the video file
        :return: watermark mask
        :raises ValueError: if no frame can be read from ``video_path``
        """
        video = cv2.VideoCapture(video_path)
        try:
            success, frame = video.read()
            if not success:
                raise ValueError('cannot read any frame from ' + str(video_path))
            roi = self.select_roi(frame, 'select watermark ROI')
            mask = numpy.full((frame.shape[0], frame.shape[1]), 255, numpy.uint8)

            # Sample roughly every fifth of the clip. Clamp to 1 so clips with
            # fewer than 5 frames (or an unknown frame count) do not produce a
            # zero step, which would make the modulo below divide by zero.
            step = max(1, int(video.get(cv2.CAP_PROP_FRAME_COUNT)) // 5)
            index = 0
            while success:
                if index % step == 0:
                    mask = cv2.bitwise_and(mask, self.generate_single_mask(frame, roi, self.threshold))
                success, frame = video.read()
                index += 1
        finally:
            video.release()

        return self.dilate_mask(mask)

    def generate_subtitle_mask(self, frame: numpy.ndarray, roi: list) -> numpy.ndarray:
        """Build the subtitle mask for one frame.

        Only the vertical extent of ``roi`` is used; the mask region spans the
        full frame width.

        :param frame: single BGR frame
        :param roi: ``[x, y, width, height]`` of the selected region
        :return: dilated subtitle mask
        """
        mask = self.generate_single_mask(frame, [0, roi[1], frame.shape[1], roi[3]], self.threshold)
        return self.dilate_mask(mask)

    def inpaint_image(self, img: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
        """Repair the masked pixels of a frame with ``cv2.INPAINT_TELEA``.

        :param img: single frame
        :param mask: mask marking the pixels to repair
        :return: repaired frame
        """
        return cv2.inpaint(img, mask, 1, cv2.INPAINT_TELEA)

    def merge_audio(self, input_path: str, output_path: str, temp_path: str):
        """Attach the audio track of the original video to the processed one.

        :param input_path: original video (audio source)
        :param output_path: destination of the muxed file
        :param temp_path: silent video produced by OpenCV
        """
        with editor.VideoFileClip(input_path) as video:
            audio = video.audio
            with editor.VideoFileClip(temp_path) as opencv_video:
                clip = opencv_video.set_audio(audio)
                clip.write_videofile(output_path)

    def _list_input_files(self) -> list:
        """Create the output directory if needed and return the input paths."""
        os.makedirs(self.output, exist_ok=True)
        # Sorted so the file used for ROI selection is deterministic.
        return [os.path.join(self.video_path, i) for i in sorted(os.listdir(self.video_path))]

    def _open_reader_and_writer(self, name: str):
        """Open ``name`` for reading and ``TEMP_VIDEO`` for writing with the
        same frame rate and frame size."""
        video = cv2.VideoCapture(name)
        fps = video.get(cv2.CAP_PROP_FPS)
        size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        video_writer = cv2.VideoWriter(TEMP_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
        return video, video_writer

    def _output_file(self, name: str, suffix: str) -> str:
        """Return the output path for input ``name``.

        ``splitext`` keeps dots inside the stem, so ``a.1.mp4`` and ``a.2.mp4``
        do not collide on the same output file.
        """
        stem = os.path.splitext(os.path.basename(name))[0]
        return os.path.join(self.output, stem + suffix)

    @staticmethod
    def _remove_temp_video():
        """Delete the intermediate silent video if it exists."""
        if os.path.exists(TEMP_VIDEO):
            os.remove(TEMP_VIDEO)

    def remove_video_watermark(self):
        """Remove the watermark from every video in ``self.video_path``.

        The mask is generated once, from the first file, and reused for all
        files (so all videos are expected to share the watermark position).
        """
        filenames = self._list_input_files()
        mask = None

        try:
            for i, name in enumerate(filenames):
                if i == 0:
                    mask = self.generate_watermark_mask(name)

                video, video_writer = self._open_reader_and_writer(name)
                try:
                    # Process frame by frame.
                    success, frame = video.read()
                    while success:
                        frame = self.inpaint_image(frame, mask)
                        video_writer.write(frame)
                        success, frame = video.read()
                finally:
                    video.release()
                    video_writer.release()

                self.merge_audio(name, self._output_file(name, '_no_watermark.mp4'), TEMP_VIDEO)
        finally:
            # Clean up the intermediate file even when processing fails.
            self._remove_temp_video()

    def remove_video_subtitle(self):
        """Remove subtitles from every video in ``self.video_path``.

        The ROI is selected once, on the first readable frame, and a fresh
        mask is computed for each frame inside that band.
        """
        filenames = self._list_input_files()
        roi = []

        try:
            for name in filenames:
                video, video_writer = self._open_reader_and_writer(name)
                try:
                    success, frame = video.read()
                    if success and not roi:
                        roi = self.select_roi(frame, 'select subtitle ROI')

                    while success:
                        mask = self.generate_subtitle_mask(frame, roi)
                        frame = self.inpaint_image(frame, mask)
                        video_writer.write(frame)
                        success, frame = video.read()
                finally:
                    video.release()
                    video_writer.release()

                # Write into the configured output directory (the original
                # referenced an undefined OUTPUT_PATH name here).
                self.merge_audio(name, self._output_file(name, '_no_sub.mp4'), TEMP_VIDEO)
        finally:
            self._remove_temp_video()


if __name__ == '__main__':
    # Remove watermark.
    video_path = 'video'
    output_path = 'output'
    remover = WatermarkRemover(video_path, output_path, threshold=80, kernel_size=5)
    remover.remove_video_watermark()

    # To remove subtitles instead, call remover.remove_video_subtitle().

效果一般吧:


点击全文阅读


本文链接:http://m.zhangshiyu.com/post/91663.html

<< 上一篇 下一篇 >>

  • 评论(0)
  • 赞助本站

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。

关于我们 | 我要投稿 | 免责申明

Copyright © 2020-2022 ZhangShiYu.com Rights Reserved.豫ICP备2022013469号-1