1"""
2BitBake 'Fetch' implementations
3
4Classes for obtaining upstream sources for the
5BitBake build tools.
6
7"""
8
9# Copyright (C) 2003, 2004  Chris Larson
10#
11# SPDX-License-Identifier: GPL-2.0-only
12#
13# Based on functions from the base bb module, Copyright 2003 Holger Schurig
14
15import shlex
16import re
17import tempfile
18import os
19import errno
20import bb
21import bb.progress
22import socket
23import http.client
24import urllib.request, urllib.parse, urllib.error
25from   bb.fetch2 import FetchMethod
26from   bb.fetch2 import FetchError
27from   bb.fetch2 import logger
28from   bb.fetch2 import runfetchcmd
29from   bb.utils import export_proxies
30from   bs4 import BeautifulSoup
31from   bs4 import SoupStrainer
32
class WgetProgressHandler(bb.progress.LineFilterProgressHandler):
    """
    Extract progress information from wget output.
    Note: relies on --progress=dot (with -v or without -q/-nv) being
    specified on the wget command line.
    """
    def __init__(self, d):
        super(WgetProgressHandler, self).__init__(d)
        # Send an initial progress event so the bar gets shown
        self._fire_progress(0)

    def writeline(self, line):
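        # A wget dot-progress line looks roughly like this (illustrative):
        #   3100K .......... .......... .......... ..........  96% 2.34M 0s
        # The regex below picks out the percentage and the rate field.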
        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
        if percs:
            progress = int(percs[-1][0])
            rate = percs[-1][1] + '/s'
            self.update(progress, rate)
            return False
        return True


class Wget(FetchMethod):
    """Class to fetch urls via 'wget'"""

    # CDNs like CloudFlare may do a 'browser integrity test' which can fail
    # with the standard wget/urllib User-Agent, so pretend to be a modern
    # browser.
    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"

    def supports(self, ud, d):
        """
        Check to see if a given url can be fetched with wget.
        """
        return ud.type in ['http', 'https', 'ftp']

    def recommends_checksum(self, urldata):
        return True

    def urldata_init(self, ud, d):
        if 'protocol' in ud.parm:
            if ud.parm['protocol'] == 'git':
                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)

        if 'downloadfilename' in ud.parm:
            ud.basename = ud.parm['downloadfilename']
        else:
            ud.basename = os.path.basename(ud.path)

        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
        if not ud.localfile:
            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))

        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate"

    def _runwget(self, ud, d, command, quiet, workdir=None):

        progresshandler = WgetProgressHandler(d)

        logger.debug2("Fetching %s using command '%s'" % (ud.url, command))
        bb.fetch2.check_network_access(d, command, ud.url)
        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)

    def download(self, ud, d):
        """Fetch urls"""

        fetchcmd = self.basecmd

        if 'downloadfilename' in ud.parm:
            localpath = os.path.join(d.getVar("DL_DIR"), ud.localfile)
            bb.utils.mkdirhier(os.path.dirname(localpath))
            fetchcmd += " -O %s" % shlex.quote(localpath)

        if ud.user and ud.pswd:
            fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd)

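        # BitBake URLs can carry ';name=value' parameters; strip them to get
        # the plain URI that is handed to wget.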
        uri = ud.url.split(";")[0]
        if os.path.exists(ud.localpath):
            # The file exists, but we didn't complete it... trying again.
            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
        else:
            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)

        self._runwget(ud, d, fetchcmd, False)

        # Sanity check since wget can pretend it succeeded when it didn't.
        # Also, this used to happen if sourceforge sent us to the mirror page.
        if not os.path.exists(ud.localpath):
            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)

        if os.path.getsize(ud.localpath) == 0:
            os.remove(ud.localpath)
            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)

        return True

    def checkstatus(self, fetch, ud, d, try_again=True):
        class HTTPConnectionCache(http.client.HTTPConnection):
            if fetch.connection_cache:
                def connect(self):
                    """Connect to the host and port specified in __init__."""

                    sock = fetch.connection_cache.get_connection(self.host, self.port)
                    if sock:
                        self.sock = sock
                    else:
                        self.sock = socket.create_connection((self.host, self.port),
                                    self.timeout, self.source_address)
                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)

                    if self._tunnel_host:
                        self._tunnel()

        class CacheHTTPHandler(urllib.request.HTTPHandler):
            def http_open(self, req):
                return self.do_open(HTTPConnectionCache, req)

            def do_open(self, http_class, req):
                """Return an addinfourl object for the request, using http_class.

                http_class must implement the HTTPConnection API from httplib.
                The addinfourl return value is a file-like object.  It also
                has methods and attributes including:
                    - info(): return a mimetools.Message object for the headers
                    - geturl(): return the original request URL
                    - code: HTTP status code
                """
                host = req.host
                if not host:
                    raise urllib.error.URLError('no host given')

                h = http_class(host, timeout=req.timeout) # will parse host:port
                h.set_debuglevel(self._debuglevel)

                headers = dict(req.unredirected_hdrs)
                headers.update(dict((k, v) for k, v in list(req.headers.items())
                            if k not in headers))

                # We want to make an HTTP/1.1 request, but the addinfourl
                # class isn't prepared to deal with a persistent connection.
                # It will try to read all remaining data from the socket,
                # which will block while the server waits for the next request.
                # So make sure the connection gets closed after the (only)
                # request.

                # Don't close the connection when connection_cache is enabled.
                if fetch.connection_cache is None:
                    headers["Connection"] = "close"
                else:
                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0

                headers = dict(
                    (name.title(), val) for name, val in list(headers.items()))

                if req._tunnel_host:
                    tunnel_headers = {}
                    proxy_auth_hdr = "Proxy-Authorization"
                    if proxy_auth_hdr in headers:
                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                        # Proxy-Authorization should not be sent to origin
                        # server.
                        del headers[proxy_auth_hdr]
                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

                try:
                    h.request(req.get_method(), req.selector, req.data, headers)
                except socket.error as err: # XXX what error?
                    # Don't close the connection when the cache is enabled.
                    # Instead, try to detect connections that are no longer
                    # usable (for example, closed unexpectedly) and remove
                    # them from the cache.
                    if fetch.connection_cache is None:
                        h.close()
                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
                        # This happens when the server closes the connection despite the Keep-Alive.
                        # Apparently urllib then uses the file descriptor, expecting it to be
                        # connected, when in reality the connection is already gone.
                        # We let the request fail and expect it to be
                        # tried once more ("try_again" in check_status()),
                        # with the dead connection removed from the cache.
                        # If it still fails, we give up, which can happen for bad
                        # HTTP proxy settings.
                        fetch.connection_cache.remove_connection(h.host, h.port)
                    raise urllib.error.URLError(err)
                else:
                    r = h.getresponse()

                # Pick apart the HTTPResponse object to get the addinfourl
                # object initialized properly.

                # Wrap the HTTPResponse object in socket's file object adapter
                # for Windows.  That adapter calls recv(), so delegate recv()
                # to read().  This weird wrapping allows the returned object to
                # have readline() and readlines() methods.

                # XXX It might be better to extract the read buffering code
                # out of socket._fileobject() and into a base class.
                r.recv = r.read

                # No body data is expected; just drain the response.
                r.read()
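                # Minimal file-like stub handed to addinfourl below; the real
                # body has already been drained, so every read returns empty.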
                class fp_dummy(object):
                    def read(self):
                        return ""
                    def readline(self):
                        return ""
                    def close(self):
                        pass
                    closed = False

                resp = urllib.response.addinfourl(fp_dummy(), r.msg, req.get_full_url())
                resp.code = r.status
                resp.msg = r.reason

                # Close the connection when the server requests it.
                if fetch.connection_cache is not None:
                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
                        fetch.connection_cache.remove_connection(h.host, h.port)

                return resp

        class HTTPMethodFallback(urllib.request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                if req.get_method() != 'GET':
                    newheaders = dict((k, v) for k, v in list(req.headers.items())
                                      if k.lower() not in ("content-length", "content-type"))
                    return self.parent.open(urllib.request.Request(req.get_full_url(),
                                                            headers=newheaders,
                                                            origin_req_host=req.origin_req_host,
                                                            unverifiable=True))

                raise urllib.error.HTTPError(req, code, msg, headers, None)

            # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
            # Forbidden when they actually mean 405 Method Not Allowed.
            http_error_403 = http_error_405


        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
            """
            urllib.request.HTTPRedirectHandler resets the method to GET on
            redirect, when we want to follow redirects using the original
            method.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
                newreq.get_method = req.get_method
                return newreq

        exported_proxies = export_proxies(d)

        handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback]
        if exported_proxies:
            handlers.append(urllib.request.ProxyHandler())
        handlers.append(CacheHTTPHandler())
        # Since Python 2.7.9 ssl cert validation is enabled by default
        # (see PEP-0476); this causes verification errors on some https
        # servers, so disable it by default.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))
        opener = urllib.request.build_opener(*handlers)

        try:
            uri = ud.url.split(";")[0]
            r = urllib.request.Request(uri)
            r.get_method = lambda: "HEAD"
            # Some servers (FusionForge, as used on Alioth) require that the
            # optional Accept header is set.
            r.add_header("Accept", "*/*")
            r.add_header("User-Agent", self.user_agent)
            def add_basic_auth(login_str, request):
                '''Adds Basic auth to the http request; pass in login:password as a string'''
                import base64
                encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
                authheader = "Basic %s" % encodeuser
                request.add_header("Authorization", authheader)

            if ud.user and ud.pswd:
                add_basic_auth(ud.user + ':' + ud.pswd, r)

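            # Fall back to credentials from the user's ~/.netrc, if any exist
            # for this host; failures here are deliberately non-fatal.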
            try:
                import netrc
                n = netrc.netrc()
                login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname)
                add_basic_auth("%s:%s" % (login, password), r)
            except (TypeError, ImportError, IOError, netrc.NetrcParseError):
                pass

            with opener.open(r) as response:
                pass
        except (urllib.error.URLError, ConnectionResetError) as e:
            if try_again:
                logger.debug2("checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug2("checkstatus() urlopen failed: %s" % e)
                return False
        return True

    def _parse_path(self, regex, s):
        """
        Find and group name, version and archive type in the given string s
        """

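        # Illustrative example, using the package regex built in _init_regexes():
        #   "dri2proto-2.3.tar.gz" -> ("dri2proto-", "2.3", "tar.gz")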
        m = regex.search(s)
        if m:
            pname = ''
            pver = ''
            ptype = ''

            mdict = m.groupdict()
            if 'name' in mdict:
                pname = mdict['name']
            if 'pver' in mdict:
                pver = mdict['pver']
            if 'type' in mdict:
                ptype = mdict['type']

            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))

            return (pname, pver, ptype)

        return None

    def _modelate_version(self, version):
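        # Normalise the version string so bb.utils.vercmp() can compare it.
        # Illustrative examples of the substitutions below:
        #   "v2.5"   -> "2.5"
        #   "1-2_3"  -> "1.2.3"
        #   "1.0rc2" -> "1.0.1000.2" (pre-release tags become sortable numbers)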
        if version[0] in ['.', '-']:
            if version[1].isdigit():
                version = version[1] + version[0] + version[2:]
            else:
                version = version[1:]

        version = re.sub('-', '.', version)
        version = re.sub('_', '.', version)
        version = re.sub('(rc)+', '.1000.', version)
        version = re.sub('(beta)+', '.100.', version)
        version = re.sub('(alpha)+', '.10.', version)
        if version[0] == 'v':
            version = version[1:]
        return version

    def _vercmp(self, old, new):
        """
        Check whether 'new' is a newer version than 'old'. We use the existing
        vercmp() for this. PE is cleared in the comparison as it's not relevant
        to the build, and PR is cleared too for simplicity, as it's difficult to
        extract from the various upstream formats. Returns a negative value when
        'new' is newer (standard bb.utils.vercmp() semantics).
        """

        (oldpn, oldpv, oldsuffix) = old
        (newpn, newpv, newsuffix) = new

        # Check for a new suffix type that we have never heard of before
        if newsuffix:
            m = self.suffix_regex_comp.search(newsuffix)
            if not m:
                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
                return False

        # Not our package so ignore it
        if oldpn != newpn:
            return False

        oldpv = self._modelate_version(oldpv)
        newpv = self._modelate_version(newpv)

        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))

    def _fetch_index(self, uri, ud, d):
        """
        Run fetch checkstatus to get directory information
        """
        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
            fetchcmd = self.basecmd
            fetchcmd += " -O " + f.name + " --user-agent='" + self.user_agent + "' '" + uri + "'"
            try:
                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
                fetchresult = f.read()
            except bb.fetch2.BBFetchException:
                fetchresult = ""

        return fetchresult

    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path.
        On error, or if no version is found, return "".
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""

    def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d):
        """
        Scan every directory in order to get the upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

        dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
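        # Illustrative example: "v2.5" -> pfx = "v", ver = "2.5"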
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When a prefix is part of the version directory we need to
                # ensure that only the version directory is used, so remove
                # any preceding directories if they exist.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5'; the
                # expected result is 'v2.5'.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, True) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]

    def _init_regexes(self, package, ud, d):
        """
        Match as many patterns as possible such as:
                gnome-common-2.20.0.tar.gz (most common format)
                gtk+-2.90.1.tar.gz
                xf86-input-synaptics-12.6.9.tar.gz
                dri2proto-2.3.tar.gz
                blktool_4.orig.tar.gz
                libid3tag-0.15.1b.tar.gz
                unzip552.tar.gz
                icu4c-3_6-src.tgz
                genext2fs_1.3.orig.tar.gz
                gst-fluendo-mp3
        """
        # match most patterns which use "-" as the separator before the version digits
        pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
        # a loose pattern such as for unzip552.tar.gz
        pn_prefix2 = r"[a-zA-Z]+"
        # a loose pattern such as for 80325-quicky-0.4.tar.gz
        pn_prefix3 = r"[0-9]+[-]?[a-zA-Z]+"
        # Save the Package Name (pn) Regex for use later
        pn_regex = r"(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)

        # match version
        pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"

        # match arch
        parch_regex = "-source|_all_"

        # The src.rpm extension was added only for rpm packages. It can be
        # removed if rpm packages will always be considered as having to be
        # manually upgraded.
        psuffix_regex = r"(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)"

        # match name, version and archive type of a package
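        # Illustrative example of a full match:
        #   "gnome-common-2.20.0.tar.gz"
        #   -> name = "gnome-common-", pver = "2.20.0", type = "tar.gz"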
        package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
        self.suffix_regex_comp = re.compile(psuffix_regex)

        # compile the regex; it can be package-specific (UPSTREAM_CHECK_REGEX) or generic
        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
        if pn_regex:
            package_custom_regex_comp = re.compile(pn_regex)
        else:
            version = self._parse_path(package_regex_comp, package)
            if version:
                package_custom_regex_comp = re.compile(
                    r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
            else:
                package_custom_regex_comp = None

        return package_custom_regex_comp

    def latest_versionstring(self, ud, d):
        """
        Manipulate the URL and try to obtain the latest package version.

        Sanity check to ensure the same name and type.
        """
        package = ud.path.split("/")[-1]
        current_version = ['', d.getVar('PV'), '']

        # It's possible to have no version in the package name, such as spectrum-fw
        if not re.search(r"\d+", package):
            current_version[1] = re.sub('_', '.', current_version[1])
            current_version[1] = re.sub('-', '.', current_version[1])
            return (current_version[1], '')

        package_regex = self._init_regexes(package, ud, d)
        if package_regex is None:
            bb.warn("latest_versionstring: package %s doesn't match pattern" % (package))
            return ('', '')
        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))

        uri = ""
        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
        if not regex_uri:
            path = ud.path.split(package)[0]

            # search for version matches in folders inside the path, like:
            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
            dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
            m = dirver_regex.search(path)
            if m:
                pn = d.getVar('PN')
                dirver = m.group('dirver')

                dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn)))
                if not dirver_pn_regex.search(dirver):
                    return (self._check_latest_version_by_dir(dirver,
                        package, package_regex, current_version, ud, d), '')

            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
        else:
            uri = regex_uri

        return (self._check_latest_version(uri, package, package_regex,
                current_version, ud, d), '')