1"""
2BitBake 'Fetch' implementations
3
4Classes for obtaining upstream sources for the
5BitBake build tools.
6
7"""
8
9# Copyright (C) 2003, 2004  Chris Larson
10#
11# SPDX-License-Identifier: GPL-2.0-only
12#
13# Based on functions from the base bb module, Copyright 2003 Holger Schurig
14
15import shlex
16import re
17import tempfile
18import os
19import errno
20import bb
21import bb.progress
22import socket
23import http.client
24import urllib.request, urllib.parse, urllib.error
25from   bb.fetch2 import FetchMethod
26from   bb.fetch2 import FetchError
27from   bb.fetch2 import logger
28from   bb.fetch2 import runfetchcmd
29from   bb.utils import export_proxies
30from   bs4 import BeautifulSoup
31from   bs4 import SoupStrainer
32
class WgetProgressHandler(bb.progress.LineFilterProgressHandler):
    """
    Extract progress information from wget output.
    Note: relies on --progress=dot (with -v or without -q/-nv) being
    specified on the wget command line.
    """
    def __init__(self, d):
        super(WgetProgressHandler, self).__init__(d)
        # Send an initial progress event so the bar gets shown
        self._fire_progress(0)

    def writeline(self, line):
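        # wget's dot-progress lines end with e.g. "22% 1.20M 3s"; grab the
        # last "<percent>% <rate>" pair on the line.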
        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
        if percs:
            progress = int(percs[-1][0])
            rate = percs[-1][1] + '/s'
            self.update(progress, rate)
            return False
        return True


class Wget(FetchMethod):
    """Class to fetch urls via 'wget'"""
    def supports(self, ud, d):
        """
        Check to see if a given url can be fetched with wget.
        """
        return ud.type in ['http', 'https', 'ftp']

    def recommends_checksum(self, urldata):
        return True

    def urldata_init(self, ud, d):
        if 'protocol' in ud.parm:
            if ud.parm['protocol'] == 'git':
                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)

        if 'downloadfilename' in ud.parm:
            ud.basename = ud.parm['downloadfilename']
        else:
            ud.basename = os.path.basename(ud.path)

        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
        if not ud.localfile:
            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))

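        # FETCHCMD_wget lets recipes/config override the fetch command; the
        # default retries twice with a 30s timeout and skips certificate checks.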
        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate"

    def _runwget(self, ud, d, command, quiet, workdir=None):

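        # Turn wget's dot output into BitBake progress events for the UI.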
        progresshandler = WgetProgressHandler(d)

        logger.debug(2, "Fetching %s using command '%s'" % (ud.url, command))
        bb.fetch2.check_network_access(d, command, ud.url)
        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)

    def download(self, ud, d):
        """Fetch urls"""

        fetchcmd = self.basecmd

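        # A 'downloadfilename' parameter means wget must write to that exact
        # path under DL_DIR instead of deriving the name from the URL.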
        if 'downloadfilename' in ud.parm:
            localpath = os.path.join(d.getVar("DL_DIR"), ud.localfile)
            bb.utils.mkdirhier(os.path.dirname(localpath))
            fetchcmd += " -O %s" % shlex.quote(localpath)

        if ud.user and ud.pswd:
            fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd)

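        # Strip BitBake's ';key=value' URL parameters; wget needs the bare URI.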
        uri = ud.url.split(";")[0]
        if os.path.exists(ud.localpath):
            # The file already exists but may be incomplete - resume it (-c)
            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
        else:
            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)

        self._runwget(ud, d, fetchcmd, False)

        # Sanity check since wget can pretend it succeeded when it didn't
        # Also, this used to happen if sourceforge sent us to the mirror page
        if not os.path.exists(ud.localpath):
            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)

        if os.path.getsize(ud.localpath) == 0:
            os.remove(ud.localpath)
            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)

        return True

    def checkstatus(self, fetch, ud, d, try_again=True):
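        """
        Check whether the URL is reachable via a HEAD request (falling back
        to GET where servers disallow HEAD), reusing cached connections when
        a connection cache is available. Returns True if the URL responds.
        """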
        class HTTPConnectionCache(http.client.HTTPConnection):
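            # Only override connect() when a connection cache is in use;
            # otherwise the stock HTTPConnection behaviour is fine.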
            if fetch.connection_cache:
                def connect(self):
                    """Connect to the host and port specified in __init__."""

                    sock = fetch.connection_cache.get_connection(self.host, self.port)
                    if sock:
                        self.sock = sock
                    else:
                        self.sock = socket.create_connection((self.host, self.port),
                                    self.timeout, self.source_address)
                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)

                    if self._tunnel_host:
                        self._tunnel()

        class CacheHTTPHandler(urllib.request.HTTPHandler):
            def http_open(self, req):
                return self.do_open(HTTPConnectionCache, req)

            def do_open(self, http_class, req):
                """Return an addinfourl object for the request, using http_class.

                http_class must implement the HTTPConnection API from
                http.client.  The addinfourl return value is a file-like
                object.  It also has methods and attributes including:
                    - info(): return the response headers
                    - geturl(): return the original request URL
                    - code: HTTP status code
                """
                host = req.host
                if not host:
                    raise urllib.error.URLError('no host given')

                h = http_class(host, timeout=req.timeout) # will parse host:port
                h.set_debuglevel(self._debuglevel)

                headers = dict(req.unredirected_hdrs)
                headers.update(dict((k, v) for k, v in list(req.headers.items())
                            if k not in headers))

                # We want to make an HTTP/1.1 request, but the addinfourl
                # class isn't prepared to deal with a persistent connection.
                # It will try to read all remaining data from the socket,
                # which will block while the server waits for the next request.
                # So make sure the connection gets closed after the (only)
                # request.

                # Don't close the connection when connection_cache is enabled.
                if fetch.connection_cache is None:
                    headers["Connection"] = "close"
                else:
                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0

                headers = dict(
                    (name.title(), val) for name, val in list(headers.items()))

                if req._tunnel_host:
                    tunnel_headers = {}
                    proxy_auth_hdr = "Proxy-Authorization"
                    if proxy_auth_hdr in headers:
                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                        # Proxy-Authorization should not be sent to origin
                        # server.
                        del headers[proxy_auth_hdr]
                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

                try:
                    h.request(req.get_method(), req.selector, req.data, headers)
                except socket.error as err: # XXX what error?
                    # Don't close the connection when the cache is enabled.
                    # Instead, try to detect connections that are no longer
                    # usable (for example, closed unexpectedly) and remove
                    # them from the cache.
                    if fetch.connection_cache is None:
                        h.close()
                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
                        # This happens when the server closes the connection despite the Keep-Alive.
                        # Apparently urllib then uses the file descriptor, expecting it to be
                        # connected, when in reality the connection is already gone.
                        # We let the request fail and expect it to be
                        # tried once more ("try_again" in checkstatus()),
                        # with the dead connection removed from the cache.
                        # If it still fails, we give up, which can happen for
                        # bad HTTP proxy settings.
                        fetch.connection_cache.remove_connection(h.host, h.port)
                    raise urllib.error.URLError(err)
                else:
                    try:
                        r = h.getresponse(buffering=True)
                    except TypeError: # buffering kw not supported
                        r = h.getresponse()

                # Pick apart the HTTPResponse object to get the addinfourl
                # object initialized properly.

                # Wrap the HTTPResponse object in socket's file object adapter
                # for Windows.  That adapter calls recv(), so delegate recv()
                # to read().  This weird wrapping allows the returned object to
                # have readline() and readlines() methods.

                # XXX It might be better to extract the read buffering code
                # out of socket._fileobject() and into a base class.
                r.recv = r.read

                # No data is expected, just drain the response.
                r.read()
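                # addinfourl wants a file-like object; the body has already
                # been consumed above, so hand it an empty stand-in.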
                class fp_dummy(object):
                    def read(self):
                        return ""
                    def readline(self):
                        return ""
                    def close(self):
                        pass
                    closed = False

                resp = urllib.response.addinfourl(fp_dummy(), r.msg, req.get_full_url())
                resp.code = r.status
                resp.msg = r.reason

                # Close the connection when the server requests it.
                if fetch.connection_cache is not None:
                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
                        fetch.connection_cache.remove_connection(h.host, h.port)

                return resp

        class HTTPMethodFallback(urllib.request.BaseHandler):
            """
            Fall back to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                if req.get_method() != 'GET':
                    newheaders = dict((k, v) for k, v in list(req.headers.items())
                                      if k.lower() not in ("content-length", "content-type"))
                    return self.parent.open(urllib.request.Request(req.get_full_url(),
                                                            headers=newheaders,
                                                            origin_req_host=req.origin_req_host,
                                                            unverifiable=True))

                raise urllib.error.HTTPError(req, code, msg, headers, None)

            # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
            # Forbidden when they actually mean 405 Method Not Allowed.
            http_error_403 = http_error_405


        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
            """
            urllib.request.HTTPRedirectHandler resets the method to GET on
            redirect, when we want to follow redirects using the original
            method.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
                newreq.get_method = req.get_method
                return newreq
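
        # export_proxies() pushes proxy settings from the datastore into the
        # environment; if any were set, add a ProxyHandler below.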
        exported_proxies = export_proxies(d)

        handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback]
        if exported_proxies:
            handlers.append(urllib.request.ProxyHandler())
        handlers.append(CacheHTTPHandler())
        # Since Python 2.7.9 ssl cert validation is enabled by default
        # (see PEP-0476); this causes verification errors on some https
        # servers, so disable it by default.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))
        opener = urllib.request.build_opener(*handlers)

        try:
            uri = ud.url.split(";")[0]
            r = urllib.request.Request(uri)
            r.get_method = lambda: "HEAD"
            # Some servers (FusionForge, as used on Alioth) require that the
            # optional Accept header is set.
            r.add_header("Accept", "*/*")
            r.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/9.10 (karmic) Firefox/3.6.12")
            def add_basic_auth(login_str, request):
                '''Adds Basic auth to http request, pass in login:password as string'''
                import base64
                encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
                authheader = "Basic %s" % encodeuser
                request.add_header("Authorization", authheader)

            if ud.user and ud.pswd:
                add_basic_auth(ud.user + ':' + ud.pswd, r)

            try:
                import netrc
                n = netrc.netrc()
                login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname)
                add_basic_auth("%s:%s" % (login, password), r)
            except (TypeError, ImportError, IOError, netrc.NetrcParseError):
                pass

            with opener.open(r) as response:
                pass
        except urllib.error.URLError as e:
            if try_again:
                logger.debug(2, "checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug(2, "checkstatus() urlopen failed: %s" % e)
                return False
        return True

    def _parse_path(self, regex, s):
        """
        Find and group name, version and archive type in the given string s.
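        For example, with the default package regex from _init_regexes(),
        "gnome-common-2.20.0.tar.gz" yields ("gnome-common-", "2.20.0", "tar.gz").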
337        """
338
339        m = regex.search(s)
340        if m:
341            pname = ''
342            pver = ''
343            ptype = ''
344
345            mdict = m.groupdict()
346            if 'name' in mdict.keys():
347                pname = mdict['name']
348            if 'pver' in mdict.keys():
349                pver = mdict['pver']
350            if 'type' in mdict.keys():
351                ptype = mdict['type']
352
353            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))
354
355            return (pname, pver, ptype)
356
357        return None
358
359    def _modelate_version(self, version):
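        """
        Normalize a version string so bb.utils.vercmp() can compare it,
        e.g. "v1.2rc3" -> "1.2.1000.3" (rc/beta/alpha pre-releases are ranked
        via the .1000/.100/.10 substitutions below).
        """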
        if version[0] in ['.', '-']:
            if version[1].isdigit():
                version = version[1] + version[0] + version[2:]
            else:
                version = version[1:]

        version = re.sub('-', '.', version)
        version = re.sub('_', '.', version)
        version = re.sub('(rc)+', '.1000.', version)
        version = re.sub('(beta)+', '.100.', version)
        version = re.sub('(alpha)+', '.10.', version)
        if version[0] == 'v':
            version = version[1:]
        return version

    def _vercmp(self, old, new):
        """
        Check whether 'new' is newer than 'old'. We use the existing vercmp()
        for this. PE is cleared in the comparison as it isn't relevant here,
        and PR is cleared too for simplicity, as it is difficult to extract
        from the various upstream formats.
        """

        (oldpn, oldpv, oldsuffix) = old
        (newpn, newpv, newsuffix) = new

        # Check for a new suffix type that we have never heard of before
        if newsuffix:
            m = self.suffix_regex_comp.search(newsuffix)
            if not m:
                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
                return False

        # Not our package so ignore it
        if oldpn != newpn:
            return False

        oldpv = self._modelate_version(oldpv)
        newpv = self._modelate_version(newpv)

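        # vercmp() returns a negative value when 'old' is older than 'new'.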
        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))

    def _fetch_index(self, uri, ud, d):
        """
        Fetch the index page for the given URI and return its contents,
        or "" on failure.
        """
        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
            agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/9.10 (karmic) Firefox/3.6.12"
            fetchcmd = self.basecmd
            fetchcmd += " -O " + f.name + " --user-agent='" + agent + "' '" + uri + "'"
            try:
                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
                fetchresult = f.read()
            except bb.fetch2.BBFetchException:
                fetchresult = ""

        return fetchresult

    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package found inside the given
        directory path. On error, or if no version is found, return "".
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

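        # Examine every <a href=...> in the index; try the href first, then
        # the whole tag text, since some indexes carry the version in the
        # link body rather than the target.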
        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""

    def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d):
        """
        Scan every directory in order to get the upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

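        # Split "<prefix><version>" directory names, e.g. "v2.5" gives
        # pfx "v" and ver "2.5".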
        dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When a prefix is part of the version directory, make sure
                # that only the version directory itself is used, so strip
                # any leading directories from the prefix.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5'; the
                # expected result is 'v2.5'.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, 1) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]

    def _init_regexes(self, package, ud, d):
        """
        Match as many patterns as possible such as:
                gnome-common-2.20.0.tar.gz (most common format)
                gtk+-2.90.1.tar.gz
                xf86-input-synaptics-12.6.9.tar.gz
                dri2proto-2.3.tar.gz
                blktool_4.orig.tar.gz
                libid3tag-0.15.1b.tar.gz
                unzip552.tar.gz
                icu4c-3_6-src.tgz
                genext2fs_1.3.orig.tar.gz
                gst-fluendo-mp3
        """
        # match most patterns which use "-" as the separator before the version digits
        pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
        # a loose pattern such as for unzip552.tar.gz
        pn_prefix2 = r"[a-zA-Z]+"
        # a loose pattern such as for 80325-quicky-0.4.tar.gz
        pn_prefix3 = r"[0-9]+[-]?[a-zA-Z]+"
        # Save the Package Name (pn) Regex for use later
        pn_regex = r"(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)

        # match version
        pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"

        # match arch
        parch_regex = "-source|_all_"

        # The src.rpm extension was added only for rpm packages. It can be
        # removed if rpm packages are always considered to require a manual
        # upgrade.
        psuffix_regex = r"(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)"

        # match name, version and archive type of a package
        package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
        self.suffix_regex_comp = re.compile(psuffix_regex)

        # Compile the regex: it can be package-specific (UPSTREAM_CHECK_REGEX)
        # or derived from the generic pattern above.
        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
        if pn_regex:
            package_custom_regex_comp = re.compile(pn_regex)
        else:
            version = self._parse_path(package_regex_comp, package)
            if version:
                package_custom_regex_comp = re.compile(
                    r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
            else:
                package_custom_regex_comp = None

        return package_custom_regex_comp

    def latest_versionstring(self, ud, d):
        """
        Manipulate the URL and try to obtain the latest package version.

        Sanity checks ensure the name and type stay the same.
        """
        package = ud.path.split("/")[-1]
        current_version = ['', d.getVar('PV'), '']

        # It's possible to have no version in the package name, e.g. spectrum-fw
        if not re.search(r"\d+", package):
            current_version[1] = re.sub('_', '.', current_version[1])
            current_version[1] = re.sub('-', '.', current_version[1])
            return (current_version[1], '')

        package_regex = self._init_regexes(package, ud, d)
        if package_regex is None:
            bb.warn("latest_versionstring: package %s doesn't match pattern" % (package))
            return ('', '')
        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))

        uri = ""
        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
        if not regex_uri:
            path = ud.path.split(package)[0]

            # search for version matches in folders inside the path, like:
            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
            dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
            m = dirver_regex.search(path)
            if m:
                pn = d.getVar('PN')
                dirver = m.group('dirver')

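                # Only scan sibling directories when the matched component is
                # a real version directory, not the package name itself
                # (optionally with a trailing digit, e.g. "python3").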
                dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn)))
                if not dirver_pn_regex.search(dirver):
                    return (self._check_latest_version_by_dir(dirver,
                        package, package_regex, current_version, ud, d), '')

            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
        else:
            uri = regex_uri

        return (self._check_latest_version(uri, package, package_regex,
                current_version, ud, d), '')
608