730 views
python爬虫

Python多线程爬取IBM硬件状态

文章目录

为什么用多线程

io操作不占cpu
 - 网络
 - 磁盘
 - socket
计算占用cpu
 - 逻辑运算
python多线程受GIL(全局解释器锁)限制,同一时刻只有一个线程执行Python字节码 -->不适合CPU密集型操作任务,但适合io密集型操作任务(线程等待io时会释放GIL)

由于请求IBM后台需要的时间较长,所以考虑使用多线程来加快响应速度

#!/usr/bin/env python
#coding:utf8
#++++++++++++description++++++++++++#
"""
@python2.7.5
@author:ying
@contact:1074020480@qq.com
@site: 
@software: PyCharm
@file: IBM_status.py
@time: 2019/9/17 上午8:44
"""
#+++++++++++++++++++++++++++++++++++#
import urllib2,urllib

import http.cookiejar
import ssl, json,threading



def get_status(ip, user='USERID', password='PASSW0RD', timeout=None):
    """Log in to the IMM web interface at ``ip`` and fetch its hardware status.

    Two HTTPS requests go through one cookie-aware opener: a POST of the
    login form to ``/data/login``, then a GET of
    ``/designs/imm/dataproviders/imm_status_hardware.php``. The login reply
    is parsed as JSON and its ``token2_name`` / ``token2_value`` pair is sent
    as an extra header on the second request.

    Args:
        ip: Host name or IP address used to build both URLs.
        user: Value of the ``user`` form field (defaults to the value that
            used to be hard-coded).
        password: Value of the ``password`` form field (same default as
            before).
        timeout: Optional timeout in seconds passed to each ``open`` call.
            ``None`` (the default) passes no timeout, as before.

    Returns:
        The decoded JSON object of the status page, with its ``'items'``
        entry moved under the key ``ip``.

    Raises:
        KeyError: If the login reply lacks the token fields or the status
            reply has no ``'items'`` entry.
        ValueError: If either reply is not valid JSON.
        urllib2.URLError: Presumably on connection or HTTP failures, as
            raised by ``opener.open``.
    """
    # Skip certificate verification. This swaps the ssl module's default
    # HTTPS context factory, so it is a process-wide change, not one limited
    # to this call.
    ssl._create_default_https_context = ssl._create_unverified_context

    # NOTE(review): ``http.cookiejar`` is the Python 3 module name while
    # ``urllib2`` is Python 2 only; on Python 2 this presumably relies on a
    # backport providing ``http.cookiejar`` -- confirm the runtime.
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

    # Only forward a timeout when the caller asked for one, so the default
    # call behaves exactly as it did before the parameter existed.
    open_kwargs = {} if timeout is None else {'timeout': timeout}

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'

    # --- Step 1: log in -----------------------------------------------------
    login_url = 'https://{0}/data/login'.format(ip)
    login_headers = {
        'User-Agent': user_agent,
        'Referer': 'https://{0}/designs/imm/index.php'.format(ip),
    }
    login_form = urllib.urlencode({
        'user': user,
        'password': password,
        'SessionTimeout': '1200',
    }).encode()
    login_request = urllib2.Request(url=login_url, headers=login_headers)
    response = opener.open(login_request, data=login_form, **open_kwargs)
    try:
        # Decode explicitly: a bare ``.decode()`` means ASCII on Python 2.
        token = json.loads(response.read().decode('utf-8'))
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()

    # --- Step 2: fetch the hardware status page ------------------------------
    status_headers = {
        'User-Agent': user_agent,
        'Referer': 'https://{0}/designs/imm/index-console.php'.format(ip),
        # The login reply names the header that carries the session token.
        token['token2_name']: token['token2_value'],
    }
    status_url = 'https://{0}/designs/imm/dataproviders/imm_status_hardware.php'.format(ip)
    status_request = urllib2.Request(url=status_url, headers=status_headers)
    response = opener.open(status_request, **open_kwargs)
    try:
        status = json.loads(response.read().decode('utf-8'))
    finally:
        response.close()

    # Re-key the payload by host so results from several hosts can be told
    # apart by the caller.
    status[ip] = status.pop('items')
    return status

def return_res(ip):
    """Return ``{host: {component type: status}}`` for the host at ``ip``.

    Calls ``get_status(ip=ip)`` and, for every key of the returned mapping,
    reads the ``'hardware_health'`` list of the first element of its value,
    recording each entry's ``'type'`` -> ``'status'``.

    A key is only added to the result once at least one health entry has
    been seen for it, and all keys share the same inner dict.
    """
    payload = get_status(ip=ip)
    summary = {}
    health = {}
    for host, entries in payload.items():
        for component in entries[0]['hardware_health']:
            component_type = component['type']
            component_status = component['status']
            health[component_type] = component_status
            # Assigned inside the loop on purpose: a host with no health
            # entries never appears in the summary.
            summary[host] = health
    return summary

# threading.Thread does not hand back the target's return value, so this
# subclass stores it on the thread object.
class MyThread(threading.Thread):
    """Thread that runs ``func(*args, **kwargs)`` and keeps its return value.

    Args:
        func: Callable executed in the new thread.
        args: Positional arguments for ``func``.
        kwargs: Optional keyword arguments for ``func``.

    Attributes set by this class:
        result: Return value of ``func``; ``None`` until ``run`` has
            completed successfully (including when ``func`` raised).
    """

    def __init__(self, func, args=(), kwargs=None):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args
        self.kwargs = {} if kwargs is None else kwargs
        # Defined up front so that reading ``result`` before the thread has
        # run, or after ``func`` raised, yields None instead of raising
        # AttributeError.
        self.result = None

    def run(self):
        """Call the wrapped function and store what it returns."""
        self.result = self.func(*self.args, **self.kwargs)

    def get_result(self):
        """Return the stored result, or ``None`` if none was produced."""
        return self.result


运行该文件(参考)

import datetime

# Time the whole run so the benefit of overlapping the requests is visible.
start_time = datetime.datetime.now()

ip_list = ['192.168.127.72', '192.168.101.1', '192.168.101.2', '192.168.101.3']
t_objs = []
res_list = []

# Start every worker before waiting on any of them so the requests overlap.
for ip in ip_list:
    t = MyThread(return_res, args=(ip,))
    t.start()
    t_objs.append(t)

for t in t_objs:
    # Wait for the worker to finish before reading its result.
    t.join()
    # get_result() gives None for a worker whose call raised, whereas reading
    # t.result directly would abort the whole run with AttributeError.
    res_list.append(t.get_result())

# Single-argument print(...) produces the same output on Python 2 and 3.
print(res_list)

end_time = datetime.datetime.now()
total_time = end_time - start_time
print('--------all time----------- {0}'.format(total_time))

影子专属博客 赣ICP备17013143号