python利用xpath爬取网上数据并存储到django模型中
这篇文章主要介绍了python利用xpath爬取网上数据并存储到django模型中,本文给大家介绍的非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考下
帮朋友制作一个网站,需要一些产品数据信息,因为是代理其他公司产品,直接爬取代理公司产品数据
1.设计数据库
from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
    """Upload path for Product.image: images/products/<title>/<random8>.<ext>.

    A random 8-hex-char name avoids filename collisions and unsafe
    user-supplied names.
    """
    ext = filename.split('.')[-1]
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "products", instance.title, filename)


def product_relatedimage_directory_path(instance, filename):
    """Upload path for product related images, grouped by the parent
    product's title: images/product_relatedimage/<product.title>/<random8>.<ext>.
    """
    ext = filename.split('.')[-1]
    filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)
    # return the whole path to the file
    return os.path.join('images', "product_relatedimage", instance.product.title, filename)


class ProductsCategory(models.Model):
    """Product category; supports a self-referencing parent for a tree."""
    name = models.CharField('产品分类名', max_length=80, unique=True)
    description = models.TextField('产品分类描述', blank=True, null=True)
    slug = models.SlugField('slug', max_length=80, blank=True, null=True)
    parent_category = models.ForeignKey(
        'self', verbose_name="父级分类", blank=True, null=True, on_delete=models.CASCADE)

    def save(self, *args, **kwargs):
        # Auto-generate the slug from the name on first save (or when empty).
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name

    class Meta:
        ordering = ['name']
        verbose_name = "产品分类"
        verbose_name_plural = verbose_name


class ProductsTag(models.Model):
    """Product tag (many-to-many with Product)."""
    name = models.CharField('产品标签名', max_length=30, unique=True)
    slug = models.SlugField('slug', max_length=40)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        # Auto-generate the slug from the name on first save (or when empty).
        if not self.id or not self.slug:
            self.slug = slugify(self.name)
        super().save(*args, **kwargs)

    class Meta:
        ordering = ['name']
        verbose_name = "产品标签"
        verbose_name_plural = verbose_name


class Product(models.Model):
    """A product scraped from the vendor site."""
    title = models.CharField('标题', max_length=255, unique=True)
    slug = models.SlugField('slug', max_length=255, blank=True, null=True)
    jscs = models.TextField('技术参数', blank=True, null=True)  # raw HTML table of technical specs
    image = models.ImageField(upload_to=products_directory_path, verbose_name="产品图片")
    views = models.PositiveIntegerField('浏览量', default=0)
    category = models.ForeignKey(
        'ProductsCategory', verbose_name='分类', on_delete=models.CASCADE, blank=True, null=True)
    tags = models.ManyToManyField('ProductsTag', verbose_name='标签集合', blank=True)

    def save(self, *args, **kwargs):
        # Auto-generate the slug from the title on first save (or when empty).
        if not self.id or not self.slug:
            self.slug = slugify(self.title)
        super().save(*args, **kwargs)

    def update_views(self):
        """Increment the view counter, writing only the `views` column."""
        self.views += 1
        self.save(update_fields=['views'])

    def get_pre(self):
        """Previous product by id, or None if this is the first."""
        return Product.objects.filter(id__lt=self.id).order_by('-id').first()

    def get_next(self):
        """Next product by id, or None if this is the last."""
        return Product.objects.filter(id__gt=self.id).order_by('id').first()

    def __str__(self):
        return self.title

    class Meta:
        verbose_name = "产品"
        verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
    """One advantage / feature bullet point belonging to a product."""
    content = models.TextField('产品优势', blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        return self.content

    class Meta:
        verbose_name = "产品优势"
        verbose_name_plural = verbose_name


class ProductBody(models.Model):
    """One paragraph of a product's description body."""
    body = models.CharField('产品内容', max_length=256, blank=True, null=True)
    product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

    def __str__(self):
        return self.product.title

    class Meta:
        verbose_name = "产品内容"
        verbose_name_plural = verbose_name
2.脚本编写
2.1编写获取网页源代码函数
def get_one_page(url):
    """Fetch `url` and return its HTML text decoded as utf-8.

    Returns None on any non-200 status or on any request exception, so
    callers can simply skip pages that fail.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        else:
            return None
    except Exception:
        # Best-effort: swallow network/timeout errors and signal via None.
        return None
2.2根据base页面获取所有产品分类页面链接
if __name__ == '__main__':
    # NOTE(review): `url` (the base page) is never defined in this listing;
    # it must be assigned before running -- TODO confirm the base URL.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Links to each product-category page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # 处理catgory_urls
    for url in catgory_urls:
        print(url)
2.3根据产品分类页面链接获取对应所有产品链接
if __name__ == '__main__':
    # NOTE(review): `url` (a category page) must be defined before running.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Category name shown on the page.
    catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print("产品分类:" + catgory[0])
    # Product links within this category.
    urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    # 处理url
    for url in urls:
        print(url)
    print("=====================================================")
两者结合起来就可以打印出所有产品链接
if __name__ == '__main__':
    # NOTE(review): `url` (the base page) must be defined before running.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Links to each product-category page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # 处理catgory_urls
    for url in catgory_urls:
        content = get_one_page(url)
        tree = etree.HTML(content)
        # Category name shown on the page.
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        print("产品分类:" + catgory[0])
        # Product links within this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # 处理url
        for url in urls:
            print(url)
        print("=====================================================")
2.4使用xpath解析函数返回产品链接的内容
if __name__ == '__main__':
    # NOTE(review): `url` (a product page) must be defined before running.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # 产品名称
    title = tree.xpath('//*[@id="wrap"]//h1/text()')
    images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')  # 产品图片
    # 性能特点
    xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
    # 技术参数: keep the first <table> as raw HTML.
    jscs = tree.xpath('//table')[0]
    jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
    # 产品内容
    cpnr = tree.xpath('//div[@class="describe"]/p')
    print('产品名称:' + title[0])
    # BUG FIX: the original printed an undefined `images_url`;
    # the scraped image URLs live in `images`.
    print('产品图片:' + images[0])
    for td in xntd:
        print('性能特点:' + td)
    print('技术参数:' + jscs_str)
    for cp in cpnr:
        # string(.) 获取当前标签下所有文本内容
        cp = cp.xpath('string(.)')
        print('产品内容:' + cp)
    print('============================================')
将三者结合在一起就可以获取所有产品信息
if __name__ == '__main__':
    # NOTE(review): `url` (the base page) must be defined before running.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Links to each product-category page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # 处理catgory_urls
    for url in catgory_urls:
        content = get_one_page(url)
        tree = etree.HTML(content)
        # 产品分类
        catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Product links within this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # 处理url
        for url in urls:
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # 产品名称
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')  # 产品图片
                # 性能特点
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # 技术参数: first <table> kept as raw HTML.
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # 产品内容
                cpnr = tree.xpath('//div[@class="describe"]/p')
                print("产品分类:" + catgory[0])
                print('产品链接:' + url)
                print('产品名称:' + title[0])
                # BUG FIX: the original printed an undefined `images_url`;
                # the scraped image URLs live in `images`.
                print('产品图片:' + images[0])
                for td in xntd:
                    print('性能特点:' + td.strip())
                # print('技术参数:' + jscs_str)
                for cp in cpnr:
                    # string(.) 获取当前标签下所有文本内容
                    cp = cp.xpath('string(.)')
                    print('产品内容:' + cp)
                print('============================================')
            except Exception as e:
                # Some product pages break the xpath assumptions (e.g. specs
                # provided as an image, no <table>) -- log and continue.
                print(e)
                print('出错url:' + url)
                pass
3.存储到django模型
import requests
from lxml.html import etree
import os
import django
import uuid

from django.core.files.base import ContentFile

# Configure Django before importing any models.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage


def get_one_page(url):
    """Fetch `url` and return its HTML text decoded as utf-8.

    Returns None on any non-200 status or on any request exception.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        res = requests.get(url=url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        else:
            return None
    except Exception:
        # NOTE(review): leftover debug print from the original article.
        print('aa')
        return None


if __name__ == '__main__':
    # NOTE(review): `url` (the base page) is never defined in this listing;
    # it must be assigned before running.
    content = get_one_page(url)
    tree = etree.HTML(content)
    # Links to each product-category page.
    catgory_urls = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
    # 处理catgory_urls
    for url in catgory_urls:
        content = get_one_page(url)
        tree = etree.HTML(content)
        # 产品分类
        p_catgory = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
        # Product links within this category.
        urls = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
        # 处理url
        for url in urls:
            content = get_one_page(url)
            try:
                tree = etree.HTML(content)
                # 产品名称
                title = tree.xpath('//*[@id="wrap"]//h1/text()')
                images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')  # 产品图片
                # 性能特点
                xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
                # 技术参数: first <table> kept as raw HTML.
                jscs = tree.xpath('//table')[0]
                jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
                # 产品内容
                cpnr = tree.xpath('//div[@class="describe"]/p')
                # 判断是否有这分类,没有则新建
                catgory = p_catgory[0]
                products_catgory = ProductsCategory.objects.filter(name=catgory).exists()
                if products_catgory:
                    products_catgory = ProductsCategory.objects.get(name=catgory)
                else:
                    products_catgory = ProductsCategory(name=catgory)
                    products_catgory.save()
                print(products_catgory)
                # 保存产品图片
                # BUG FIX: the original used an undefined `images_url`;
                # take the first scraped image URL.
                images_url = images[0]
                image_content = requests.get(url=images_url)
                ext = images_url.split('.')[-1]  # 获取图片类型
                filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # 随机生成图片名字
                # 将图片保存为django类型
                upload_image_file = ContentFile(image_content.content, name=filename)
                product = Product(title=title[0], jscs=jscs_str,
                                  image=upload_image_file, category=products_catgory)
                product.save()
                for td in xntd:
                    product_advantage = ProductAdvantage()
                    product_advantage.content = td
                    product_advantage.product = product
                    product_advantage.save()
                for cp in cpnr:
                    cp = cp.xpath('string(.)')
                    product_body = ProductBody()
                    product_body.body = cp
                    product_body.product = product
                    product_body.save()
            except Exception as e:
                # Pages whose specs are an image (no <table>) land here;
                # log the url and continue with the next product.
                print(e)
                print('出错url:' + url)
最后自己手动处理出错url(页面没有获取到技术参数,技术参数是一张图片)
4.总结
1.xpath 获取标签内容时,p标签中嵌套span标签,源码如下
<div class="describe" style="position: relative;">
  <p><span>板 宽:</span>1500mm</p>
  <p><span>板 厚:</span>4.5 mm</p>
  <p><span>出料口:</span>6口</p>
  <p><span>重 量:</span>6000 kg</p>
</div>
使用xpath获取p标签内容
我想得到的效果如下
板 宽:1500mm
板 厚:4.5 mm
出料口:6口
重 量:6000 kg
使用以下xpath 只能分开获取,不是想要的效果
//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
百度之后找到的解决办法,使用xpath('string(.)')
1.先获取所有p标签
cpnr = tree.xpath('//div[@class="describe"]/p')
2.使用**string(.)**获取所有标签所有文本
cp = cp.xpath('string(.)')
循环遍历所有p标签即可
到此这篇关于python利用xpath爬取网上数据并存储到django模型中的文章就介绍到这了