xref: /openbmc/openbmc/poky/bitbake/lib/bb/fetch2/wget.py (revision eb8dc403)
# ex:ts=4:sw=4:sts=4:et
# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-
"""
BitBake 'Fetch' implementations

Classes for obtaining upstream sources for the
BitBake build tools.

"""

# Copyright (C) 2003, 2004  Chris Larson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Based on functions from the base bb module, Copyright 2003 Holger Schurig

import re
import tempfile
import subprocess
import os
import logging
import errno
import bb
import bb.progress
import urllib.request, urllib.parse, urllib.error
from   bb.fetch2 import FetchMethod
from   bb.fetch2 import FetchError
from   bb.fetch2 import logger
from   bb.fetch2 import runfetchcmd
from   bb.utils import export_proxies
from   bs4 import BeautifulSoup
from   bs4 import SoupStrainer

class WgetProgressHandler(bb.progress.LineFilterProgressHandler):
    """
    Extract progress information from wget output.
    Note: relies on --progress=dot (with -v or without -q/-nv) being
    specified on the wget command line.
    """
    def __init__(self, d):
        super(WgetProgressHandler, self).__init__(d)
        # Send an initial progress event so the bar gets shown
        self._fire_progress(0)

    def writeline(self, line):
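        # A wget --progress=dot line typically ends with a
        # "<percent>% <rate>" pair, e.g. (illustrative):
        #   "  1750K .......... .......... ..........  98% 1.51M 0s"
        # so pick the last such pair on the line.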
        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
        if percs:
            progress = int(percs[-1][0])
            rate = percs[-1][1] + '/s'
            self.update(progress, rate)
            return False
        return True


class Wget(FetchMethod):
    """Class to fetch urls via 'wget'"""
    def supports(self, ud, d):
        """
        Check to see if a given url can be fetched with wget.
        """
        return ud.type in ['http', 'https', 'ftp']

    def recommends_checksum(self, urldata):
        return True

    def urldata_init(self, ud, d):
        if 'protocol' in ud.parm:
            if ud.parm['protocol'] == 'git':
                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)

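        # 'downloadfilename' lets a recipe rename the fetched file, e.g.
        # (hypothetical URL):
        #   SRC_URI = "https://example.com/download?id=42;downloadfilename=foo-1.0.tar.gz"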
        if 'downloadfilename' in ud.parm:
            ud.basename = ud.parm['downloadfilename']
        else:
            ud.basename = os.path.basename(ud.path)

        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
        if not ud.localfile:
            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))

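        # FETCHCMD_wget lets a distro or site configuration override the wget
        # invocation, e.g. (hypothetical values):
        #   FETCHCMD_wget = "/usr/bin/env wget -t 5 -T 60 --passive-ftp"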
        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate"

    def _runwget(self, ud, d, command, quiet, workdir=None):

        progresshandler = WgetProgressHandler(d)

        logger.debug(2, "Fetching %s using command '%s'" % (ud.url, command))
        bb.fetch2.check_network_access(d, command, ud.url)
        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)

    def download(self, ud, d):
        """Fetch urls"""

        fetchcmd = self.basecmd

        if 'downloadfilename' in ud.parm:
            dldir = d.getVar("DL_DIR")
            bb.utils.mkdirhier(os.path.dirname(dldir + os.sep + ud.localfile))
            fetchcmd += " -O " + dldir + os.sep + ud.localfile

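        # --auth-no-challenge makes wget send Basic credentials preemptively
        # rather than waiting for a 401 challenge from the server.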
        if ud.user and ud.pswd:
            fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd)

        uri = ud.url.split(";")[0]
        if os.path.exists(ud.localpath):
            # file exists, but we didn't complete it... try again with -c (continue)
            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
        else:
            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)

        self._runwget(ud, d, fetchcmd, False)

        # Sanity check since wget can pretend it succeeded when it didn't
        # Also, this used to happen if sourceforge sent us to the mirror page
        if not os.path.exists(ud.localpath):
            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)

        if os.path.getsize(ud.localpath) == 0:
            os.remove(ud.localpath)
            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)

        return True

    def checkstatus(self, fetch, ud, d, try_again=True):
        import urllib.request, urllib.error, urllib.parse, socket, http.client
        from urllib.response import addinfourl
        from bb.fetch2 import FetchConnectionCache

        class HTTPConnectionCache(http.client.HTTPConnection):
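            # Reuse sockets across requests when BitBake's connection cache
            # is active; otherwise http.client's default connect() is used.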
            if fetch.connection_cache:
                def connect(self):
                    """Connect to the host and port specified in __init__."""

                    sock = fetch.connection_cache.get_connection(self.host, self.port)
                    if sock:
                        self.sock = sock
                    else:
                        self.sock = socket.create_connection((self.host, self.port),
                                    self.timeout, self.source_address)
                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)

                    if self._tunnel_host:
                        self._tunnel()

        class CacheHTTPHandler(urllib.request.HTTPHandler):
            def http_open(self, req):
                return self.do_open(HTTPConnectionCache, req)

            def do_open(self, http_class, req):
                """Return an addinfourl object for the request, using http_class.

                http_class must implement the HTTPConnection API from http.client.
                The addinfourl return value is a file-like object.  It also
                has methods and attributes including:
                    - info(): return an email.message.Message object for the headers
                    - geturl(): return the original request URL
                    - code: HTTP status code
                """
                host = req.host
                if not host:
                    raise urllib.error.URLError('no host given')

                h = http_class(host, timeout=req.timeout) # will parse host:port
                h.set_debuglevel(self._debuglevel)

                headers = dict(req.unredirected_hdrs)
                headers.update(dict((k, v) for k, v in list(req.headers.items())
                            if k not in headers))

                # We want to make an HTTP/1.1 request, but the addinfourl
                # class isn't prepared to deal with a persistent connection.
                # It will try to read all remaining data from the socket,
                # which will block while the server waits for the next request.
                # So make sure the connection gets closed after the (only)
                # request.

                # Don't close the connection when connection_cache is enabled.
                if fetch.connection_cache is None:
                    headers["Connection"] = "close"
                else:
                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0

                headers = dict(
                    (name.title(), val) for name, val in list(headers.items()))

                if req._tunnel_host:
                    tunnel_headers = {}
                    proxy_auth_hdr = "Proxy-Authorization"
                    if proxy_auth_hdr in headers:
                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                        # Proxy-Authorization should not be sent to origin
                        # server.
                        del headers[proxy_auth_hdr]
                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

                try:
                    h.request(req.get_method(), req.selector, req.data, headers)
                except socket.error as err: # XXX what error?
                    # Don't close the connection when the cache is enabled.
                    # Instead, try to detect connections that are no longer
                    # usable (for example, closed unexpectedly) and remove
                    # them from the cache.
                    if fetch.connection_cache is None:
                        h.close()
                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
                        # This happens when the server closes the connection despite the Keep-Alive.
                        # Apparently urllib then uses the file descriptor, expecting it to be
                        # connected, when in reality the connection is already gone.
                        # We let the request fail and expect it to be
                        # tried once more ("try_again" in check_status()),
                        # with the dead connection removed from the cache.
                        # If it still fails, we give up, which can happen for bad
                        # HTTP proxy settings.
                        fetch.connection_cache.remove_connection(h.host, h.port)
                    raise urllib.error.URLError(err)
                else:
                    try:
                        r = h.getresponse(buffering=True)
                    except TypeError: # buffering kw not supported
                        r = h.getresponse()

                # Pick apart the HTTPResponse object to get the addinfourl
                # object initialized properly.

                # Wrap the HTTPResponse object in socket's file object adapter
                # for Windows.  That adapter calls recv(), so delegate recv()
                # to read().  This weird wrapping allows the returned object to
                # have readline() and readlines() methods.

                # XXX It might be better to extract the read buffering code
                # out of socket._fileobject() and into a base class.
                r.recv = r.read

                # no data, just have to read
                r.read()
                class fp_dummy(object):
                    def read(self):
                        return ""
                    def readline(self):
                        return ""
                    def close(self):
                        pass
                    closed = False

                resp = addinfourl(fp_dummy(), r.msg, req.get_full_url())
                resp.code = r.status
                resp.msg = r.reason

                # Close the connection when the server requests it.
                if fetch.connection_cache is not None:
                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
                        fetch.connection_cache.remove_connection(h.host, h.port)

                return resp

        class HTTPMethodFallback(urllib.request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in list(req.headers.items())
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib.request.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.origin_req_host,
                                                        unverifiable=True))

            """
            Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
            Forbidden when they actually mean 405 Method Not Allowed.
            """
            http_error_403 = http_error_405


        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
            """
            urllib.request.HTTPRedirectHandler resets the method to GET on redirect,
            when we want to follow redirects using the original method.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
                newreq.get_method = lambda: req.get_method()
                return newreq

        exported_proxies = export_proxies(d)

        handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback]
        if exported_proxies:
            handlers.append(urllib.request.ProxyHandler())
        handlers.append(CacheHTTPHandler())
        # XXX: Since Python 2.7.9 ssl cert validation is enabled by default
        # (see PEP-0476); this causes verification errors on some https
        # servers, so disable it by default.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))
        opener = urllib.request.build_opener(*handlers)

        try:
            uri = ud.url.split(";")[0]
            r = urllib.request.Request(uri)
            r.get_method = lambda: "HEAD"
            # Some servers (FusionForge, as used on Alioth) require that the
            # optional Accept header is set.
            r.add_header("Accept", "*/*")
            def add_basic_auth(login_str, request):
                '''Adds Basic auth to http request, pass in login:password as string'''
                import base64
                encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
                authheader = "Basic %s" % encodeuser
                request.add_header("Authorization", authheader)

            if ud.user and ud.pswd:
                add_basic_auth(ud.user + ':' + ud.pswd, r)

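            # Also honour credentials from ~/.netrc when present, e.g. a line
            # like (illustrative):
            #   machine example.com login anonymous password guest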
            try:
                import netrc, urllib.parse
                n = netrc.netrc()
                login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname)
                add_basic_auth("%s:%s" % (login, password), r)
            except (TypeError, ImportError, IOError, netrc.NetrcParseError):
                pass

            with opener.open(r) as response:
                pass
        except urllib.error.URLError as e:
            if try_again:
                logger.debug(2, "checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug(2, "checkstatus() urlopen failed: %s" % e)
                return False
        return True

    def _parse_path(self, regex, s):
        """
        Find and group name, version and archive type in the given string s
        """

        m = regex.search(s)
        if m:
            pname = ''
            pver = ''
            ptype = ''

            mdict = m.groupdict()
            if 'name' in mdict:
                pname = mdict['name']
            if 'pver' in mdict:
                pver = mdict['pver']
            if 'type' in mdict:
                ptype = mdict['type']

            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))

            return (pname, pver, ptype)

        return None

    def _modelate_version(self, version):
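        # Normalise a version string so bb.utils.vercmp() can compare it:
        # separators become '.', and pre-release tags are mapped to numeric
        # components (alpha -> .10., beta -> .100., rc -> .1000.), so e.g.
        # "v1.2-rc2" becomes "1.2..1000.2" (illustrative).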
        if version[0] in ['.', '-']:
            if version[1].isdigit():
                version = version[1] + version[0] + version[2:]
            else:
                version = version[1:]

        version = re.sub('-', '.', version)
        version = re.sub('_', '.', version)
        version = re.sub('(rc)+', '.1000.', version)
        version = re.sub('(beta)+', '.100.', version)
        version = re.sub('(alpha)+', '.10.', version)
        if version[0] == 'v':
            version = version[1:]
        return version

    def _vercmp(self, old, new):
        """
        Check whether the 'new' version is newer than the 'old' one. We use the
        existing vercmp() for this purpose. PE is cleared in the comparison as it
        is not relevant to the build, and PR is cleared too for simplicity, as it
        is difficult to extract from the various upstream formats.
        """

        (oldpn, oldpv, oldsuffix) = old
        (newpn, newpv, newsuffix) = new

        """
        Check for a new suffix type that we have never heard of before
        """
        if newsuffix:
            m = self.suffix_regex_comp.search(newsuffix)
            if not m:
                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
                return False

        """
        Not our package so ignore it
        """
        if oldpn != newpn:
            return False

        oldpv = self._modelate_version(oldpv)
        newpv = self._modelate_version(newpv)

        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))

    def _fetch_index(self, uri, ud, d):
        """
        Run fetch checkstatus to get directory information
        """
        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
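            # Some servers refuse directory listings to clients that do not
            # look like a browser, so present a browser-like User-Agent.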
            agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/9.10 (karmic) Firefox/3.6.12"
            fetchcmd = self.basecmd
            fetchcmd += " -O " + f.name + " --user-agent='" + agent + "' '" + uri + "'"
            try:
                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
                fetchresult = f.read()
            except bb.fetch2.BBFetchException:
                fetchresult = ""

        return fetchresult

    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path
        If error or no version, return ""
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""

    def _check_latest_version_by_dir(self, dirver, package, package_regex,
            current_version, ud, d):
        """
        Scan every directory in order to get upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

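        # e.g. dirver "v2.5" -> pfx "v", ver "2.5"; a plain "5.7" -> pfx "",
        # ver "5.7" (illustrative)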
        dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When the prefix is part of the version directory, make sure
                # only the version directory itself is used, so strip any
                # preceding directories.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5'; the
                # expected result is v2.5.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, 1) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]

    def _init_regexes(self, package, ud, d):
        """
        Match as many patterns as possible such as:
                gnome-common-2.20.0.tar.gz (most common format)
                gtk+-2.90.1.tar.gz
                xf86-input-synaptics-12.6.9.tar.gz
                dri2proto-2.3.tar.gz
                blktool_4.orig.tar.gz
                libid3tag-0.15.1b.tar.gz
                unzip552.tar.gz
                icu4c-3_6-src.tgz
                genext2fs_1.3.orig.tar.gz
                gst-fluendo-mp3
        """
        # match most patterns, which use "-" as the separator before the version digits
        pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
        # a loose pattern such as for unzip552.tar.gz
        pn_prefix2 = "[a-zA-Z]+"
        # a loose pattern such as for 80325-quicky-0.4.tar.gz
        pn_prefix3 = "[0-9]+[-]?[a-zA-Z]+"
        # Save the Package Name (pn) Regex for use later
        pn_regex = "(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)

        # match version
        pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"

        # match arch
        parch_regex = "-source|_all_"

        # the src.rpm extension was added only for rpm packages. It can be
        # removed if rpm packages will always be considered as having to be
        # manually upgraded
        psuffix_regex = r"(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)"

        # match name, version and archive type of a package
        package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
        self.suffix_regex_comp = re.compile(psuffix_regex)

        # compile the regex; it can be specified per package or fall back to the generic one
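        # e.g. in a recipe (hypothetical):
        #   UPSTREAM_CHECK_REGEX = "foo-(?P<pver>\d+(\.\d+)+)\.tar\.gz"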
        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
        if pn_regex:
            package_custom_regex_comp = re.compile(pn_regex)
        else:
            version = self._parse_path(package_regex_comp, package)
            if version:
                package_custom_regex_comp = re.compile(
                    r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
            else:
                package_custom_regex_comp = None

        return package_custom_regex_comp

    def latest_versionstring(self, ud, d):
        """
        Manipulate the URL and try to obtain the latest package version

        Sanity check to ensure the same name and type.
        """
        package = ud.path.split("/")[-1]
        current_version = ['', d.getVar('PV'), '']

        """possible to have no version in pkg name, such as spectrum-fw"""
        if not re.search(r"\d+", package):
            current_version[1] = re.sub('_', '.', current_version[1])
            current_version[1] = re.sub('-', '.', current_version[1])
            return (current_version[1], '')

        package_regex = self._init_regexes(package, ud, d)
        if package_regex is None:
            bb.warn("latest_versionstring: package %s doesn't match pattern" % (package))
            return ('', '')
        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))

        uri = ""
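        # UPSTREAM_CHECK_URI lets a recipe point the version check at a page
        # other than the download location, e.g. (hypothetical):
        #   UPSTREAM_CHECK_URI = "https://example.com/releases/"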
        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
        if not regex_uri:
            path = ud.path.split(package)[0]

            # search for version matches in folders inside the path, like:
            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
            dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
            m = dirver_regex.search(path)
            if m:
                pn = d.getVar('PN')
                dirver = m.group('dirver')

                dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn)))
                if not dirver_pn_regex.search(dirver):
                    return (self._check_latest_version_by_dir(dirver,
                        package, package_regex, current_version, ud, d), '')

            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
        else:
            uri = regex_uri

        return (self._check_latest_version(uri, package, package_regex,
                current_version, ud, d), '')
627