• 微信公众号:美女很有趣。 工作之余,放松一下,关注即送10G+美女照片!

Python urllib2 模块请求超时的底层实现

开发技术 开发技术 5小时前 1次浏览

前言

最近,笔者在使用 Python2.7 的 urllib2 模块进行 HTTP 相关操作时,遇到请求超时的问题。在调用 urllib2.urlopen() 发送请求时,可以通过参数 timeout 指定超时时间(单位为秒;注意该参数属于 urlopen(),而不是 Request 的构造函数):

# Build the request object; no timeout is given here.
request = urllib2.Request("http://www.baidu.com")
# The timeout is supplied to urlopen(), not to Request().
response = urllib2.urlopen(request, timeout=8)

然而,这里指定的超时时间到底是指什么?是约定时间内没有完成 IO 操作?还是约定时间内无 IO 事件产生?

socket 如何设置超时

在 Linux 环境中,对于一个 Berkeley 套接字,设置超时的方式有三种:

  1. 通过 alarm 函数设置超时,当超时时产生 SIGALRM 信号;
  2. 通过 setsockopt 函数和套接字选项 SO_RCVTIMEO、SO_SNDTIMEO,来设置 recv()/send() 的超时时间;
  3. 通过非阻塞 socket 与 IO 多路复用。

urllib2 模块如何实现超时

connect 超时

在上文中提到,可以通过 urllib2.urlopen 函数来设置一个 HTTP 请求对象的超时时间。而在 urllib2 模块中,一个请求对象如何与 socket 关联起来呢?笔者在 socket 模块的 create_connection 函数中加入堆栈打印,输出如下:

File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
    response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
    '_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
    result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
    return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1195, in do_open
    h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 1057, in request
    self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1097, in _send_request
    self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders
    self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 897, in _send_output
    self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 859, in send
    self.connect()
File "/usr/lib/python2.7/httplib.py", line 836, in connect
    self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 557, in create_connection
    traceback.print_stack()

可见,urllib2.urlopen() 最终会调用 socket.create_connection() 来创建 socket 并建立 TCP 连接:

def create_connection(address, timeout=_GLOBAL_DEFAULT_TIMEOUT,
                      source_address=None):
    """Connect to *address* and return the connected socket.

    *address* is a ``(host, port)`` pair. Every result returned by
    ``getaddrinfo`` for it is tried in order, and the first socket whose
    ``connect`` succeeds is returned.

    If *timeout* is anything other than the ``_GLOBAL_DEFAULT_TIMEOUT``
    sentinel, it is applied to the socket with ``settimeout`` before
    connecting. If *source_address* is truthy, the socket is bound to it
    before connecting.

    Raises ``error``: the last error caught if every attempt failed, or
    "getaddrinfo returns an empty list" if there was nothing to try.
    """
    host, port = address
    # Remember the most recent failure so it can be re-raised if no
    # candidate address works.
    err = None
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
        af, socktype, proto, canonname, sa = res
        sock = None
        try:
            sock = socket(af, socktype, proto)
            # The sentinel means "no timeout was passed": leave the
            # socket's timeout untouched in that case.
            if timeout is not _GLOBAL_DEFAULT_TIMEOUT:
                sock.settimeout(timeout)
            if source_address:
                sock.bind(source_address)
            sock.connect(sa)
            return sock

        except error as _:
            err = _
            # Close the failed socket before trying the next address.
            if sock is not None:
                sock.close()

    if err is not None:
        raise err
    else:
        raise error("getaddrinfo returns an empty list")

在上述代码中,程序首先创建 socket 对象,紧接着根据实参来设置 socket 的超时。socket.settimeout 函数的底层实现如下(仅列出 Linux 下的实现):

/* Set the timeout of socket object s from the Python argument arg.
 *
 * None is stored as -1.0 and selects blocking mode. A non-negative
 * number is stored as given and selects non-blocking mode. A negative
 * number is rejected with ValueError.
 * Returns None on success, or NULL with an exception set on failure. */
static PyObject *
sock_settimeout(PySocketSockObject *s, PyObject *arg)
{
    double timeout;

    if (arg == Py_None)
        timeout = -1.0;     /* negative value == "no timeout" */
    else {
        timeout = PyFloat_AsDouble(arg);
        if (timeout < 0.0) {
            /* Keep an exception that is already pending (presumably one
               raised by PyFloat_AsDouble); otherwise report the
               negative value. */
            if (!PyErr_Occurred())
                PyErr_SetString(PyExc_ValueError,
                                "Timeout value out of range");
            return NULL;
        }
    }

    s->sock_timeout = timeout;
    /* block == 1 only when timeout is negative (the None case); any
       timeout >= 0.0 switches the descriptor to non-blocking mode. */
    internal_setblocking(s, timeout < 0.0);

    Py_INCREF(Py_None);
    return Py_None;
}


/* Put the socket's descriptor into blocking mode (block != 0) or
 * non-blocking mode (block == 0).
 *
 * Only the generic branch (fcntl with O_NONBLOCK) and the BeOS branch are
 * shown; the other platform branches are omitted from this excerpt.
 * Always returns 1; the results of fcntl/setsockopt are not checked. */
static int
internal_setblocking(PySocketSockObject *s, int block)
{
#ifndef RISCOS
#ifndef MS_WINDOWS
    int delay_flag;
#endif
#endif

    Py_BEGIN_ALLOW_THREADS
#ifdef __BEOS__
    block = !block;
    setsockopt(s->sock_fd, SOL_SOCKET, SO_NONBLOCK,
               (void *)(&block), sizeof(int));
#else
#ifndef RISCOS
#ifndef MS_WINDOWS
#if defined(PYOS_OS2) && !defined(PYCC_GCC)
	// omitted
#elif defined(__VMS)
    // omitted
#else  /* !PYOS_OS2 && !__VMS */
    /* Read the current file status flags, clear or set O_NONBLOCK,
       then write the flags back. */
    delay_flag = fcntl(s->sock_fd, F_GETFL, 0);
    if (block)
        delay_flag &= (~O_NONBLOCK);
    else
        delay_flag |= O_NONBLOCK;
    fcntl(s->sock_fd, F_SETFL, delay_flag);
#endif /* !PYOS_OS2 */
#else /* MS_WINDOWS */
    // omitted
#endif /* MS_WINDOWS */
#else /* RISCOS */
   // omitted
#endif /* RISCOS */
#endif /* __BEOS__ */
    Py_END_ALLOW_THREADS

    /* Since these don't return anything */
    return 1;
}

可见,当 urllib2 模块发起一个带超时时间的请求时,在 Linux 下,settimeout() 会先把超时值记录到 s->sock_timeout 中,再通过 fcntl 给相应 socket 设置 O_NONBLOCK 标志,使其进入非阻塞模式(若 timeout 为 None,则保持阻塞模式)。

完成超时设置后,程序会调用socket.connect()来建立TCP连接,该函数的底层实现如下:

/* Implementation of the socket object's connect(): connect s to the
 * address described by the Python object addro.
 *
 * Returns None on success. Returns NULL after setting socket_timeout
 * ("timed out") when internal_connect reports a timeout, and returns the
 * result of s->errorhandler() when internal_connect reports any other
 * failure. */
static PyObject *
sock_connect(PySocketSockObject *s, PyObject *addro)
{
    sock_addr_t addrbuf;
    int addrlen;
    int res;
    int timeout;    /* out-parameter filled in by internal_connect */

    /* Convert the Python-level address into a C sockaddr. */
    if (!getsockaddrarg(s, addro, SAS2SA(&addrbuf), &addrlen))
        return NULL;

    Py_BEGIN_ALLOW_THREADS
    res = internal_connect(s, SAS2SA(&addrbuf), addrlen, &timeout);
    Py_END_ALLOW_THREADS

    /* The timeout case is checked before res, so a timeout is always
       reported as socket_timeout and never through errorhandler(). */
    if (timeout == 1) {
        PyErr_SetString(socket_timeout, "timed out");
        return NULL;
    }
    if (res != 0)
        return s->errorhandler();
    Py_INCREF(Py_None);
    return Py_None;
}

其中核心函数internal_connect的源码如下(仅列出Linux下的实现):

/* Perform the connect() call for socket s, waiting for completion when
 * the socket has a positive timeout.
 *
 * Returns 0 on success or an errno-style error code on failure.
 * *timeoutp receives the value returned by internal_select (1 means the
 * wait timed out), or 0 when no wait was performed.
 * The Windows branch is omitted from this excerpt. */
static int
internal_connect(PySocketSockObject *s, struct sockaddr *addr, int addrlen,
                 int *timeoutp)
{
    int res, timeout;

    timeout = 0;
    res = connect(s->sock_fd, addr, addrlen);

#ifdef MS_WINDOWS
	// omitted
#else
    
    if (s->sock_timeout > 0.0) {
        /* connect() failed with EINPROGRESS: wait until the socket
           becomes writable or the timeout expires. */
        if (res < 0 && errno == EINPROGRESS && IS_SELECTABLE(s)) {
            timeout = internal_select(s, 1);
            if (timeout == 0) {
                /* Bug #1019808: in case of an EINPROGRESS,
                   use getsockopt(SO_ERROR) to get the real
                   error. */
                socklen_t res_size = sizeof res;
                (void)getsockopt(s->sock_fd, SOL_SOCKET,
                                 SO_ERROR, &res, &res_size);
                /* "Already connected" is treated as success. */
                if (res == EISCONN)
                    res = 0;
                errno = res;
            }
            else if (timeout == -1) {
                res = errno;            /* had error */
            }
            else
                res = EWOULDBLOCK;                      /* timed out */
        }
    }

    /* A raw negative connect() result is turned into the errno value. */
    if (res < 0)
        res = errno;

#endif
    /* Tell the caller whether the wait timed out. */
    *timeoutp = timeout;

    return res;
}

internal_connect 函数的工作流程可以概括为:

  1. 调用 connect() 来尝试建立连接;
  2. connect() 调用结束,如果 socket 没有设置超时时间,则判断 connect 函数的返回值,若返回值小于 0(出错),则返回错误码 errno;
  3. 如果 socket 设置了超时时间(s->sock_timeout > 0.0),则说明该 socket 为非阻塞模式,那么接下来的工作其实就是非阻塞 connect 的流程:
    • 判断 errno 是否为 EINPROGRESS(TCP 三次握手正在进行中);
    • 若是,通过 IO 多路复用(select、poll)来检测 socket 是否可写;
    • 若发生可写事件,再调用 getsockopt() 来检查 socket 是否出错;
    • 若无错误发生,则表明该 socket 的非阻塞 connect 已完成,TCP 连接已建立;
    • 若在超时时间内一直没有发生可写事件,则返回 EWOULDBLOCK,并通过 timeoutp 告知调用者发生了超时。

不信,见 internal_select 函数的实现:

/* Wait until socket s is ready for writing (writing != 0) or reading
 * (writing == 0), for at most s->sock_timeout seconds.
 *
 * Returns:
 *    0  the wait reported at least one ready descriptor, or no wait was
 *       needed (timeout not positive, or descriptor already closed);
 *    1  the wait timed out;
 *   -1  poll()/select() failed. */
static int
internal_select(PySocketSockObject *s, int writing)
{
    int n;

    /* Nothing to do unless we're in timeout mode (not non-blocking) */
    if (s->sock_timeout <= 0.0)
        return 0;

    /* Guard against closed socket */
    if (s->sock_fd < 0)
        return 0;

    /* Prefer poll, if available, since you can poll() any fd
     * which can't be done with select(). */
#ifdef HAVE_POLL
    {
        struct pollfd pollfd;
        int timeout;

        pollfd.fd = s->sock_fd;
        pollfd.events = writing ? POLLOUT : POLLIN;

        /* s->sock_timeout is in seconds, timeout in ms */
        /* Adding 0.5 before the cast rounds to the nearest millisecond. */
        timeout = (int)(s->sock_timeout * 1000 + 0.5);
        n = poll(&pollfd, 1, timeout);
    }
#else
    {
        /* Construct the arguments to select */
        fd_set fds;
        struct timeval tv;
        /* Split the timeout into whole seconds and microseconds. */
        tv.tv_sec = (int)s->sock_timeout;
        tv.tv_usec = (int)((s->sock_timeout - tv.tv_sec) * 1e6);
        FD_ZERO(&fds);
        FD_SET(s->sock_fd, &fds);

        /* See if the socket is ready */
        if (writing)
            n = select(s->sock_fd+1, NULL, &fds, NULL, &tv);
        else
            n = select(s->sock_fd+1, &fds, NULL, NULL, &tv);
    }
#endif

    /* Map the poll()/select() result onto this function's return codes. */
    if (n < 0)
        return -1;
    if (n == 0)
        return 1;
    return 0;
}

从上述代码中可见,若 internal_select()(内部优先使用 poll(),否则使用 select())等待可写事件超时,它会返回 1;sock_connect 函数据此调用 PyErr_SetString() 来设置异常对象,以提示 Python 层 connect() 超时:

/* A timeout value of 1 means the wait timed out: set the socket_timeout
   exception with the message "timed out" and return NULL. */
if (timeout == 1) {
	PyErr_SetString(socket_timeout, "timed out");
	return NULL;
}

recv/send 超时

当 TCP 连接成功建立之后,接下来需要处理应用层上的网络数据交互。对应到 urllib2 模块,就是如 HTTP 请求与响应等操作。而这些操作,都是通过调用底层的 sock_recv()、sock_send() 等来实现的。

以 sock_recv 函数为例:

/* Implementation of the socket object's recv(bufsize[, flags]).
 *
 * Allocates a string of recvlen bytes, fills it through sock_recv_guts,
 * and shrinks it to the number of bytes actually received.
 * Returns the string, or NULL on failure: bad arguments, negative size,
 * allocation failure, a negative result from sock_recv_guts, or a failed
 * resize. */
static PyObject *
sock_recv(PySocketSockObject *s, PyObject *args)
{
    int recvlen, flags = 0;
    ssize_t outlen;
    PyObject *buf;

    /* recvlen is required; flags is optional and defaults to 0. */
    if (!PyArg_ParseTuple(args, "i|i:recv", &recvlen, &flags))
        return NULL;

    if (recvlen < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "negative buffersize in recv");
        return NULL;
    }

    /* Allocate a new string. */
    buf = PyString_FromStringAndSize((char *) 0, recvlen);
    if (buf == NULL)
        return NULL;

    /* Call the guts */
    outlen = sock_recv_guts(s, PyString_AS_STRING(buf), recvlen, flags);
    if (outlen < 0) {
        /* An error occurred, release the string and return an
           error. */
        Py_DECREF(buf);
        return NULL;
    }
    if (outlen != recvlen) {
        /* We did not read as many bytes as we anticipated, resize the
           string if possible and be successful. */
        if (_PyString_Resize(&buf, outlen) < 0)
            /* Oopsy, not so successful after all. */
            return NULL;
    }

    return buf;
}

其核心函数 sock_recv_guts() 的实现如下:

/* Receive up to len bytes from socket s into cbuf.
 *
 * Each recv() is preceded by internal_select(s, 0), which waits for the
 * socket to become readable. recv() is called only when that wait
 * returns 0.
 * Returns the number of bytes stored in cbuf, or -1 on failure. On a
 * timeout the socket_timeout exception ("timed out") is set; for other
 * failures select_error() or s->errorhandler() is called (presumably
 * these set the Python exception; confirm against their definitions). */
static ssize_t
sock_recv_guts(PySocketSockObject *s, char* cbuf, int len, int flags)
{
    ssize_t outlen = -1;
    int timeout;
#ifdef __VMS
    int remaining;
    char *read_buf;
#endif

    if (!IS_SELECTABLE(s)) {
        select_error();
        return -1;
    }

#ifndef __VMS
    Py_BEGIN_ALLOW_THREADS
    /* Wait for readability first; skip recv() when the wait did not
       return 0. */
    timeout = internal_select(s, 0);
    if (!timeout)
        outlen = recv(s->sock_fd, cbuf, len, flags);
    Py_END_ALLOW_THREADS

    if (timeout == 1) {
        PyErr_SetString(socket_timeout, "timed out");
        return -1;
    }
    /* outlen is still -1 when recv() was skipped because
       internal_select returned -1, so that case lands here too. */
    if (outlen < 0) {
        /* Note: the call to errorhandler() ALWAYS indirectly returned
           NULL, so ignore its return value */
        s->errorhandler();
        return -1;
    }
#else
    /* VMS: receive in pieces of at most SEGMENT_SIZE bytes. */
    read_buf = cbuf;
    remaining = len;
    while (remaining != 0) {
        unsigned int segment;
        int nread = -1;

        /* Use a full SEGMENT_SIZE piece while at least that much is
           still wanted, otherwise ask for whatever remains. */
        segment = remaining /SEGMENT_SIZE;
        if (segment != 0) {
            segment = SEGMENT_SIZE;
        }
        else {
            segment = remaining;
        }

        Py_BEGIN_ALLOW_THREADS
        timeout = internal_select(s, 0);
        if (!timeout)
            nread = recv(s->sock_fd, read_buf, segment, flags);
        Py_END_ALLOW_THREADS

        if (timeout == 1) {
            PyErr_SetString(socket_timeout, "timed out");
            return -1;
        }
        if (nread < 0) {
            s->errorhandler();
            return -1;
        }
        /* Stop as soon as a read did not deliver everything that was
           still outstanding; account for the bytes it did deliver. */
        if (nread != remaining) {
            read_buf += nread;
            break;
        }

        remaining -= segment;
        read_buf += segment;
    }
    /* Total received = distance the write pointer advanced. */
    outlen = read_buf - cbuf;
#endif /* !__VMS */

    return outlen;
}

可见,程序首先调用 internal_select() 来检测可读事件。若有可读事件发生,则调用 recv() 进行数据读取(当然未必能读取完整);若超时,则同样调用 PyErr_SetString() 来设置超时的异常对象。

总结

当使用 urllib2 模块创建一个具有超时时间的 HTTP 请求时,其实质是创建一个非阻塞 socket;当对这个 HTTP 请求进行 IO 操作时(如建立 TCP 连接、收发数据),若发生了超时,则表明在超时时间内,没有产生相应的 IO 事件,而非“在超时时间内没有完成 IO 操作”。


程序员灯塔
转载请注明原文链接:Python urllib2 模块请求超时的底层实现
喜欢 (0)