# -*- coding: utf-8 -*-
# @Time : 2022-12-25 8:59
# @Author : Kmoon_Hs
# @File : 景区评论——携程
import json
import urllib.request
import urllib.parse
import pymysql
count = 0  # running total of scraped comments

# Look up the poiId for a scenic-spot keyword via Ctrip's search API
def get_poiID(word):
    url = 'https://m.ctrip.com/restapi/h5api/globalsearch/search?'
    data = {
        'action': 'gsonline',
        'source': 'globalonline',
        'keyword': '%s' % word,
        't': '1671936524797'
    }
    data = urllib.parse.urlencode(data)
    url = url + data
    response = urllib.request.urlopen(url)
    content = response.read().decode('utf-8')
    obj = json.loads(content)
    return obj['data'][0]['poiId']
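
# A guarded variant (a sketch, not part of the original flow): the search API
# returns an empty 'data' list for unknown keywords, in which case get_poiID
# above raises IndexError; this wrapper returns None so callers can skip the spot.
def get_poiID_safe(word):
    try:
        return get_poiID(word)
    except (IndexError, KeyError):
        return None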
# Build the POST request for the comment-list API
def create_request(url, poiId, page, pageSize):
    # Set the request headers; a valid cookie (or token) may be required
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4173.2 Safari/537.36',
        'cookie': 'xxxxxxx',
        'referer': 'https://detail.tmall.com/',
        'Content-Type': 'text/plain'
    }
    # POST form data
    data = {
        "arg": {
            "channelType": 2,
            "collapseType": 0,
            "commentTagId": 0,            # comment tag ID; 0 = all comments
            "pageIndex": '%s' % page,     # which page to fetch
            "pageSize": '%s' % pageSize,  # comments per page
            "poiId": '%s' % poiId,        # scenic spot ID
            "sourceType": 1,
            "sortType": 3,                # sort type: 3 = smart sort, 1 = by time
            "starType": 0
        }
    }
    # Encoding: request parameters that nest a dict must be serialized,
    # so JSON-encode the nested data before sending it as the request body
    data = json.dumps(data).encode('utf-8')
    request = urllib.request.Request(url=url, headers=headers, data=data)
    return request
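
# Debug helper (a sketch): print the URL and the exact JSON body a request
# carries, useful when the API rejects the payload. Assumes the Request was
# built by create_request above.
def dump_request(request):
    print(request.full_url)
    print(request.data.decode('utf-8'))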
# Send the request, posing as a browser
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    return content
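
# A retry variant (a sketch): transient network errors are common when paging
# through many spots, so retry a few times before giving up.
def get_content_with_retry(request, retries=3, delay=2):
    import time
    import urllib.error
    for attempt in range(retries):
        try:
            return get_content(request)
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)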
# Parse one page of comment JSON into row tuples
def down_load(content, pageSize):
    global count
    content = json.loads(content)
    datas = []
    # Walk every comment on the current page
    for i in range(pageSize):
        base = content['result']['items'][i]
        if base['userInfo'] is None:
            userNick = '匿名用户'  # "anonymous user"
        else:
            userNick = base['userInfo']['userNick']
        # Some users rate only some of the three aspects (or none at all),
        # so default every score to -1 and fill in whatever is present
        scores_View = scores_Interest = scores_Cost = -1
        if len(base['scores']) >= 1:
            scores_View = base['scores'][0]['score']
        if len(base['scores']) >= 2:
            scores_Interest = base['scores'][1]['score']
        if len(base['scores']) >= 3:
            scores_Cost = base['scores'][2]['score']
        scores = [scores_View, scores_Interest, scores_Cost]
        comment = base['content']
        address = base['ipLocatedName']
        sendTime = str(base['publishTypeTag']).split()[0]
        count += 1
        # print(userNick, scores, comment, address, sendTime)
        datas.append((count, userNick, scores[0], scores[1], scores[2], comment, address, sendTime))
    return datas
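
# An alternative to the positional score block above (a sketch): look each
# score up by its name instead of its index. The '景色'/'趣味'/'性价比'
# labels are an assumption about the API's score names and may differ.
def extract_scores(score_list):
    by_name = {s.get('name'): s.get('score', -1) for s in (score_list or [])}
    return (by_name.get('景色', -1), by_name.get('趣味', -1), by_name.get('性价比', -1))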
def CreateDB():
    # Open the database connection
    db = pymysql.connect(host='localhost', user='root', password='123456', db='spider01', charset='utf8mb4')
    # Get a cursor
    cursor = db.cursor()
    # Create the table; if it already exists (or a primary key repeats), MySQL raises an error
    try:
        cursor.execute('create table ScenicspotComments(id int primary key,Spotname varchar(256),userNick varchar(500),scores_View int,scores_Interest int,scores_Cost int,comment varchar(1000),address varchar(256),sendTime varchar(128)) DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;')
    except pymysql.Error:
        print('Table already exists')
    finally:
        cursor.close()
        db.close()
def InsertDatabase(id, Spotname, userNick, scores_View, scores_Interest, scores_Cost, comment, address, sendTime):
    # Open the database connection
    db = pymysql.connect(host='localhost', user='root', password='123456', db='spider01', charset='utf8mb4')
    # Get a cursor
    cursor = db.cursor()
    # Use parameterized placeholders so quotes inside comments cannot break the SQL
    query = "insert into ScenicspotComments values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    cursor.execute(query, (id, Spotname, userNick, scores_View, scores_Interest, scores_Cost, comment, address, sendTime))
    # Close the cursor, commit, close the connection;
    # without the commit the rows never show up in the database
    cursor.close()
    db.commit()
    db.close()
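
# A batched variant (a sketch): one connection and one executemany per page
# instead of one connection per row. rows are the tuples down_load returns,
# with the spot name spliced in as the second column.
def InsertMany(rows):
    db = pymysql.connect(host='localhost', user='root', password='123456', db='spider01', charset='utf8mb4')
    cursor = db.cursor()
    query = "insert into ScenicspotComments values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    cursor.executemany(query, rows)
    cursor.close()
    db.commit()
    db.close()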
def SelectDatabase():
    # Reopen the database connection
    db = pymysql.connect(host='localhost', user='root', password='123456', db='spider01', charset='utf8mb4')
    cursor = db.cursor()
    # Query the table and print every row
    cursor.execute('''select * from ScenicspotComments''')
    results = cursor.fetchall()
    for row in results:
        print(row)
    # Clean up
    cursor.close()
    db.close()
# Response field map (under ['result']['items'][i]):
#   ['userInfo']['userNick']   - user name
#   ['userInfo']['userImage']  - user avatar
#   ['scores']                 - ratings: scenery [0]['score'], fun [1]['score'], value [2]['score']
#   ['content']                - comment text
#   ['ipLocatedName']          - poster's region (by IP)
#   ['publishTypeTag']         - publish time
# Spot-name keywords (usable with get_poiID)
word = [
    '桃花源古镇',
    '安仁古镇',
    '洛带古镇',
    '黄龙溪古镇',
    '明月山旅游区(潭下村)',
    '里耶古城',
    '洪江古商城',
    '白石山风景区',
    '淘金小镇',
    '月牙泉小镇'
]
Spotlist = {
    '桃花源古镇': 79536,
    '安仁古镇': 90181,
    '洛带古镇': 82946,
    '黄龙溪古镇': 81011,
    '明月山旅游区(潭下村)': 10758280,
    '里耶古城': 99742,
    '洪江古商城': 80764,
    '白石山风景区': 101017,
    '淘金小镇': 101436,
    '月牙泉小镇': 30096431
}
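
# The poiId table above was filled in by hand; a sketch to build it from the
# search lookup instead (commented out because it hits the API once per name):
# Spotlist = {w: get_poiID(w) for w in word}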
if __name__ == '__main__':
    # Comment-list endpoint
    url = 'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList'
    # Create the database table
    CreateDB()
    # Walk every scenic spot
    for word, poiId in Spotlist.items():
        # print(word, poiId)
        spotname = word
        # Page counts differ per spot, so treat running off the end of
        # the item list (an IndexError) as "this spot is finished"
        try:
            # Walk the pages; raise the range's upper bound to scrape more
            # (currently 10 pages of 10 comments per spot)
            for page in range(1, 11):
                request = create_request(url, poiId, page, 10)
                # Send the request and fetch the response
                content = get_content(request)
                # Parse the page
                datas = down_load(content, 10)
                # print(datas)
                # Insert into the database
                for row in datas:
                    InsertDatabase(row[0], spotname, row[1], row[2], row[3], row[4], row[5], row[6], row[7])
        except IndexError:
            print("======= Scraping finished: {} comments so far ============".format(count))
        finally:
            print("======= Scraping finished and stored: {} comments so far ======".format(count))
    # Check the database contents to confirm the inserts
    # SelectDatabase()