Cartoonizing GIFs and Videos with Python: A Detailed Walkthrough
This article shows how to use animegan2-pytorch in Python to give GIF animations and videos a cartoon effect. The code has real learning value; interested readers can follow along.
Table of Contents
Preface
Environment Dependencies
Core Code
Cartoonizing GIFs
Cartoonizing Videos
Summary
Preface
Reference article: Cartoonizing Photos with Python (Python实现照片卡通化)
Continuing from that article, I tinkered with the model some more so it can also cartoonize GIF animations and videos. After all, if a single image works, a stack of frames should work just as well. So, having already punched through the dimensional wall once, I'm throwing in two more kicks.
Project GitHub address: GitHub link
Environment Dependencies
Beyond the dependencies from the reference article, a few extra packages are needed. requirements.txt is as follows:
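(The original dependency list did not survive extraction; the following is my reconstruction, inferred from the imports of the two scripts below, with versions left unpinned as an assumption.)

```text
torch
torchvision
Pillow
imageio
opencv-python
numpy
ffmpy
```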
If anything else about the environment is unclear, see the article linked in the preface, which explains the setup in detail.
Core Code
Without further ado, the GIF code first.
Cartoonizing GIFs
The implementation is as follows:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/12/5 18:10
# @Author  : 剑客阿良_ALiang
# @Site    :
# @File    : gif_cartoon_tool.py
from PIL import Image, ImageEnhance, ImageSequence
import torch
from torchvision.transforms.functional import to_tensor, to_pil_image
from torch import nn
import os
import torch.nn.functional as F
import uuid
import imageio


# -------------------------- hy add 01 --------------------------
class ConvNormLReLU(nn.Sequential):
    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1,
                 pad_mode="reflect", groups=1, bias=False):
        pad_layer = {
            "zero": nn.ZeroPad2d,
            "same": nn.ReplicationPad2d,
            "reflect": nn.ReflectionPad2d,
        }
        if pad_mode not in pad_layer:
            raise NotImplementedError

        super(ConvNormLReLU, self).__init__(
            pad_layer[pad_mode](padding),
            nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
                      padding=0, groups=groups, bias=bias),
            nn.GroupNorm(num_groups=1, num_channels=out_ch, affine=True),
            nn.LeakyReLU(0.2, inplace=True)
        )


class InvertedResBlock(nn.Module):
    def __init__(self, in_ch, out_ch, expansion_ratio=2):
        super(InvertedResBlock, self).__init__()

        self.use_res_connect = in_ch == out_ch
        bottleneck = int(round(in_ch * expansion_ratio))
        layers = []
        if expansion_ratio != 1:
            layers.append(ConvNormLReLU(in_ch, bottleneck, kernel_size=1, padding=0))

        # dw
        layers.append(ConvNormLReLU(bottleneck, bottleneck, groups=bottleneck, bias=True))
        # pw
        layers.append(nn.Conv2d(bottleneck, out_ch, kernel_size=1, padding=0, bias=False))
        layers.append(nn.GroupNorm(num_groups=1, num_channels=out_ch, affine=True))

        self.layers = nn.Sequential(*layers)

    def forward(self, input):
        out = self.layers(input)
        if self.use_res_connect:
            out = input + out
        return out


class Generator(nn.Module):
    def __init__(self):
        super().__init__()

        self.block_a = nn.Sequential(
            ConvNormLReLU(3, 32, kernel_size=7, padding=3),
            ConvNormLReLU(32, 64, stride=2, padding=(0, 1, 0, 1)),
            ConvNormLReLU(64, 64)
        )

        self.block_b = nn.Sequential(
            ConvNormLReLU(64, 128, stride=2, padding=(0, 1, 0, 1)),
            ConvNormLReLU(128, 128)
        )

        self.block_c = nn.Sequential(
            ConvNormLReLU(128, 128),
            InvertedResBlock(128, 256, 2),
            InvertedResBlock(256, 256, 2),
            InvertedResBlock(256, 256, 2),
            InvertedResBlock(256, 256, 2),
            ConvNormLReLU(256, 128),
        )

        self.block_d = nn.Sequential(
            ConvNormLReLU(128, 128),
            ConvNormLReLU(128, 128)
        )

        self.block_e = nn.Sequential(
            ConvNormLReLU(128, 64),
            ConvNormLReLU(64, 64),
            ConvNormLReLU(64, 32, kernel_size=7, padding=3)
        )

        self.out_layer = nn.Sequential(
            nn.Conv2d(32, 3, kernel_size=1, stride=1, padding=0, bias=False),
            nn.Tanh()
        )

    def forward(self, input, align_corners=True):
        out = self.block_a(input)
        half_size = out.size()[-2:]
        out = self.block_b(out)
        out = self.block_c(out)

        if align_corners:
            out = F.interpolate(out, half_size, mode="bilinear", align_corners=True)
        else:
            out = F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=False)
        out = self.block_d(out)

        if align_corners:
            out = F.interpolate(out, input.size()[-2:], mode="bilinear", align_corners=True)
        else:
            out = F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=False)
        out = self.block_e(out)

        out = self.out_layer(out)
        return out


# -------------------------- hy add 02 --------------------------
def handle(gif_path: str, output_dir: str, type: int, device='cpu'):
    _ext = os.path.basename(gif_path).strip().split('.')[-1]
    if type == 1:
        _checkpoint = './weights/paprika.pt'
    elif type == 2:
        _checkpoint = './weights/face_paint_512_v1.pt'
    elif type == 3:
        _checkpoint = './weights/face_paint_512_v2.pt'
    elif type == 4:
        _checkpoint = './weights/celeba_distill.pt'
    else:
        raise Exception('type not support')
    os.makedirs(output_dir, exist_ok=True)
    net = Generator()
    net.load_state_dict(torch.load(_checkpoint, map_location="cpu"))
    net.to(device).eval()
    result = os.path.join(output_dir, '{}.{}'.format(uuid.uuid1().hex, _ext))
    img = Image.open(gif_path)
    out_images = []
    # cartoonize the GIF frame by frame
    for frame in ImageSequence.Iterator(img):
        frame = frame.convert("RGB")
        with torch.no_grad():
            image = to_tensor(frame).unsqueeze(0) * 2 - 1
            out = net(image.to(device), False).cpu()
            out = out.squeeze(0).clip(-1, 1) * 0.5 + 0.5
            out = to_pil_image(out)
            out_images.append(out)
    # out_images[0].save(result, save_all=True, loop=True, append_images=out_images[1:], duration=100)
    imageio.mimsave(result, out_images, fps=15)
    return result


if __name__ == '__main__':
    print(handle('samples/gif/128.gif', 'samples/gif_result/', 3, 'cuda'))
```
Code notes:
1. The main handle method takes: the GIF path, the output directory, the model type, and the device (default 'cpu'; pass 'cuda' to run on the GPU).
2. The type argument selects the model checkpoint; type 3 is recommended, as it renders portraits more vividly.
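One caveat: imageio.mimsave(result, out_images, fps=15) hardcodes the output frame rate, so the cartoonized GIF discards the source's per-frame delays. Below is a minimal sketch of how the original timing could be preserved instead; it assumes imageio v2's GIF writer, which accepts a per-frame duration list in seconds, and the helper name gif_frame_durations is mine, not the project's:

```python
from PIL import Image, ImageSequence

def gif_frame_durations(gif_path: str) -> list:
    # Per-frame delay of the source GIF in milliseconds;
    # frames without an explicit delay fall back to 100 ms.
    img = Image.open(gif_path)
    return [frame.info.get('duration', 100)
            for frame in ImageSequence.Iterator(img)]

# Inside handle(), the hardcoded fps could then be replaced with:
#   durations = gif_frame_durations(gif_path)
#   imageio.mimsave(result, out_images, duration=[d / 1000 for d in durations])
```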
Let's run it and verify.
Below is the GIF material I prepared (image in the original post).
The execution output is as follows (screenshot in the original post).
And the effect:
Ha, that's pretty fun.
Cartoonizing Videos
The implementation is as follows:
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/12/5 0:26
# @Author  : 剑客阿良_ALiang
# @Site    :
# @File    : video_cartoon_tool.py
from PIL import Image, ImageEnhance
import torch
from torchvision.transforms.functional import to_tensor, to_pil_image
from torch import nn
import os
import torch.nn.functional as F
import uuid
import cv2
import numpy as np
import time
from ffmpy import FFmpeg


# -------------------------- hy add 01 --------------------------
class ConvNormLReLU(nn.Sequential):
    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1,
                 pad_mode="reflect", groups=1, bias=False):
        pad_layer = {
            "zero": nn.ZeroPad2d,
            "same": nn.ReplicationPad2d,
            "reflect": nn.ReflectionPad2d,
        }
        if pad_mode not in pad_layer:
            raise NotImplementedError

        super(ConvNormLReLU, self).__init__(
            pad_layer[pad_mode](padding),
            nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=stride,
                      padding=0, groups=groups, bias=bias),
            nn.GroupNorm(num_groups=1, num_channels=out_ch, affine=True),
            nn.LeakyReLU(0.2, inplace=True)
        )


class InvertedResBlock(nn.Module):
    def __init__(self, in_ch, out_ch, expansion_ratio=2):
        super(InvertedResBlock, self).__init__()

        self.use_res_connect = in_ch == out_ch
        bottleneck = int(round(in_ch * expansion_ratio))
        layers = []
        if expansion_ratio != 1:
            layers.append(ConvNormLReLU(in_ch, bottleneck, kernel_size=1, padding=0))

        # dw
        layers.append(ConvNormLReLU(bottleneck, bottleneck, groups=bottleneck, bias=True))
        # pw
        layers.append(nn.Conv2d(bottleneck, out_ch, kernel_size=1, padding=0, bias=False))
        layers.append(nn.GroupNorm(num_groups=1, num_channels=out_ch, affine=True))

        self.layers = nn.Sequential(*layers)

    def forward(self, input):
        out = self.layers(input)
        if self.use_res_connect:
            out = input + out
        return out


class Generator(nn.Module):
    def __init__(self):
        super().__init__()

        self.block_a = nn.Sequential(
            ConvNormLReLU(3, 32, kernel_size=7, padding=3),
            ConvNormLReLU(32, 64, stride=2, padding=(0, 1, 0, 1)),
            ConvNormLReLU(64, 64)
        )

        self.block_b = nn.Sequential(
            ConvNormLReLU(64, 128, stride=2, padding=(0, 1, 0, 1)),
            ConvNormLReLU(128, 128)
        )

        self.block_c = nn.Sequential(
            ConvNormLReLU(128, 128),
            InvertedResBlock(128, 256, 2),
            InvertedResBlock(256, 256, 2),
            InvertedResBlock(256, 256, 2),
            InvertedResBlock(256, 256, 2),
            ConvNormLReLU(256, 128),
        )

        self.block_d = nn.Sequential(
            ConvNormLReLU(128, 128),
            ConvNormLReLU(128, 128)
        )

        self.block_e = nn.Sequential(
            ConvNormLReLU(128, 64),
            ConvNormLReLU(64, 64),
            ConvNormLReLU(64, 32, kernel_size=7, padding=3)
        )

        self.out_layer = nn.Sequential(
            nn.Conv2d(32, 3, kernel_size=1, stride=1, padding=0, bias=False),
            nn.Tanh()
        )

    def forward(self, input, align_corners=True):
        out = self.block_a(input)
        half_size = out.size()[-2:]
        out = self.block_b(out)
        out = self.block_c(out)

        if align_corners:
            out = F.interpolate(out, half_size, mode="bilinear", align_corners=True)
        else:
            out = F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=False)
        out = self.block_d(out)

        if align_corners:
            out = F.interpolate(out, input.size()[-2:], mode="bilinear", align_corners=True)
        else:
            out = F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=False)
        out = self.block_e(out)

        out = self.out_layer(out)
        return out


# -------------------------- hy add 02 --------------------------
def handle(video_path: str, output_dir: str, type: int, fps: int, device='cpu'):
    _ext = os.path.basename(video_path).strip().split('.')[-1]
    if type == 1:
        _checkpoint = './weights/paprika.pt'
    elif type == 2:
        _checkpoint = './weights/face_paint_512_v1.pt'
    elif type == 3:
        _checkpoint = './weights/face_paint_512_v2.pt'
    elif type == 4:
        _checkpoint = './weights/celeba_distill.pt'
    else:
        raise Exception('type not support')
    os.makedirs(output_dir, exist_ok=True)
    # extract the audio track from the source video
    _audio = extract(video_path, output_dir, 'wav')
    net = Generator()
    net.load_state_dict(torch.load(_checkpoint, map_location="cpu"))
    net.to(device).eval()
    result = os.path.join(output_dir, '{}.{}'.format(uuid.uuid1().hex, _ext))
    capture = cv2.VideoCapture(video_path)
    size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    print(size)
    videoWriter = cv2.VideoWriter(result, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
    cul = 0
    with torch.no_grad():
        while True:
            ret, frame = capture.read()
            if ret:
                print(ret)
                # note: OpenCV frames are BGR ndarrays; they go through the
                # model and back to the writer without channel conversion
                image = to_tensor(frame).unsqueeze(0) * 2 - 1
                out = net(image.to(device), False).cpu()
                out = out.squeeze(0).clip(-1, 1) * 0.5 + 0.5
                out = to_pil_image(out)
                contrast_enhancer = ImageEnhance.Contrast(out)
                img_enhanced_image = contrast_enhancer.enhance(2)
                enhanced_image = np.asarray(img_enhanced_image)
                videoWriter.write(enhanced_image)
                cul += 1
                print('frame {}'.format(cul))
            else:
                break
    videoWriter.release()
    # merge the original audio back into the new video
    _final_video = video_add_audio(result, _audio, output_dir)
    return _final_video


# -------------------------- hy add 03 --------------------------
def extract(video_path: str, tmp_dir: str, ext: str):
    file_name = '.'.join(os.path.basename(video_path).split('.')[0:-1])
    print('file name: {}, extracting audio'.format(file_name))
    if ext == 'mp3':
        return _run_ffmpeg(video_path, os.path.join(tmp_dir, '{}.{}'.format(uuid.uuid1().hex, ext)), 'mp3')
    if ext == 'wav':
        return _run_ffmpeg(video_path, os.path.join(tmp_dir, '{}.{}'.format(uuid.uuid1().hex, ext)), 'wav')


def _run_ffmpeg(video_path: str, audio_path: str, format: str):
    ff = FFmpeg(inputs={video_path: None},
                outputs={audio_path: '-f {} -vn'.format(format)})
    print(ff.cmd)
    ff.run()
    return audio_path


# merge an audio track into a video
def video_add_audio(video_path: str, audio_path: str, output_dir: str):
    _ext_video = os.path.basename(video_path).strip().split('.')[-1]
    _ext_audio = os.path.basename(audio_path).strip().split('.')[-1]
    if _ext_audio not in ['mp3', 'wav']:
        raise Exception('audio format not support')
    _codec = 'copy'
    if _ext_audio == 'wav':
        _codec = 'aac'
    result = os.path.join(output_dir, '{}.{}'.format(uuid.uuid4(), _ext_video))
    ff = FFmpeg(inputs={video_path: None, audio_path: None},
                outputs={result: '-map 0:v -map 1:a -c:v copy -c:a {} -shortest'.format(_codec)})
    print(ff.cmd)
    ff.run()
    return result


if __name__ == '__main__':
    print(handle('samples/video/981.mp4', 'samples/video_result/', 3, 25, 'cuda'))
```
Code notes
1. The main implementation method takes: the video path, the output directory, the model type, the fps (frame rate), and the device (default 'cpu'; pass 'cuda' for GPU mode).
2. The type argument selects the model checkpoint; type 3 is recommended, as it renders portraits more vividly.
3. Design of the code: first extract the audio from the video, then cartoonize the video frame by frame and write the frames into a new video, and finally merge the new video with the original audio.
For extracting audio from a video, see my other article: Extracting Audio from Video with Python.
For merging audio into a video, see my other article: Adding Audio to Video with Python.
4. Processing leaves intermediate files (the extracted audio and the silent cartoonized video) in the output directory; the code does not clean them up. If needed, you can modify the code to do so yourself; one possible approach is sketched after this list.
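A minimal cleanup sketch, assuming handle() as defined above and that everything it produces lands in output_dir; handle_and_clean is a hypothetical wrapper of mine, not part of the original project:

```python
import os

def handle_and_clean(video_path: str, output_dir: str, type: int, fps: int, device='cpu'):
    # Snapshot the output directory, run the pipeline, then delete every
    # newly created file except the final merged video.
    before = set(os.listdir(output_dir)) if os.path.isdir(output_dir) else set()
    final_video = handle(video_path, output_dir, type, fps, device)
    for name in set(os.listdir(output_dir)) - before:
        path = os.path.join(output_dir, name)
        if os.path.abspath(path) != os.path.abspath(final_video):
            os.remove(path)
    return final_video
```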
Let's verify.
Below is a screenshot of the video material I prepared; I will upload it to GitHub.
The execution output (screenshot in the original post).
Screenshots of the effect:
Not bad at all.
Summary
This time there is actually plenty to summarize. First, some issues with the open-source project's current models.
1. I tested quite a few images. Overall, the models do not cartoonize Asian face shapes nearly as well as Western ones, so the training data is probably still insufficient. That is understandable: producing annotated data specifically for cartoonization sounds like a real headache. My suggestion is to keep an eye on whether the project releases updated models.
2. If a video has hardcoded subtitles, the subtitles get cartoonized too. Consider using material where the video and subtitles are separate; the result will look better.
That concludes this detailed walkthrough of cartoonizing GIFs and videos with Python.
Original article: https://blog.csdn.net/zhiweihongyan1/article/details/121736254