xref: /openbmc/openbmc/poky/bitbake/lib/bb/fetch2/wget.py (revision 44b3caf2)
1eb8dc403SDave Cobbley"""
2eb8dc403SDave CobbleyBitBake 'Fetch' implementations
3eb8dc403SDave Cobbley
4eb8dc403SDave CobbleyClasses for obtaining upstream sources for the
5eb8dc403SDave CobbleyBitBake build tools.
6eb8dc403SDave Cobbley
7eb8dc403SDave Cobbley"""
8eb8dc403SDave Cobbley
9eb8dc403SDave Cobbley# Copyright (C) 2003, 2004  Chris Larson
10eb8dc403SDave Cobbley#
11c342db35SBrad Bishop# SPDX-License-Identifier: GPL-2.0-only
12eb8dc403SDave Cobbley#
13eb8dc403SDave Cobbley# Based on functions from the base bb module, Copyright 2003 Holger Schurig
14eb8dc403SDave Cobbley
1582c905dcSAndrew Geisslerimport shlex
16eb8dc403SDave Cobbleyimport re
17eb8dc403SDave Cobbleyimport tempfile
18eb8dc403SDave Cobbleyimport os
19eb8dc403SDave Cobbleyimport errno
20eb8dc403SDave Cobbleyimport bb
21eb8dc403SDave Cobbleyimport bb.progress
2219323693SBrad Bishopimport socket
2319323693SBrad Bishopimport http.client
24eb8dc403SDave Cobbleyimport urllib.request, urllib.parse, urllib.error
25eb8dc403SDave Cobbleyfrom   bb.fetch2 import FetchMethod
26eb8dc403SDave Cobbleyfrom   bb.fetch2 import FetchError
27eb8dc403SDave Cobbleyfrom   bb.fetch2 import logger
28eb8dc403SDave Cobbleyfrom   bb.fetch2 import runfetchcmd
29eb8dc403SDave Cobbleyfrom   bs4 import BeautifulSoup
30eb8dc403SDave Cobbleyfrom   bs4 import SoupStrainer
31eb8dc403SDave Cobbley
32eb8dc403SDave Cobbleyclass WgetProgressHandler(bb.progress.LineFilterProgressHandler):
33eb8dc403SDave Cobbley    """
34eb8dc403SDave Cobbley    Extract progress information from wget output.
35eb8dc403SDave Cobbley    Note: relies on --progress=dot (with -v or without -q/-nv) being
36eb8dc403SDave Cobbley    specified on the wget command line.
37eb8dc403SDave Cobbley    """
38eb8dc403SDave Cobbley    def __init__(self, d):
39eb8dc403SDave Cobbley        super(WgetProgressHandler, self).__init__(d)
40eb8dc403SDave Cobbley        # Send an initial progress event so the bar gets shown
41eb8dc403SDave Cobbley        self._fire_progress(0)
42eb8dc403SDave Cobbley
43eb8dc403SDave Cobbley    def writeline(self, line):
44eb8dc403SDave Cobbley        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
45eb8dc403SDave Cobbley        if percs:
46eb8dc403SDave Cobbley            progress = int(percs[-1][0])
47eb8dc403SDave Cobbley            rate = percs[-1][1] + '/s'
48eb8dc403SDave Cobbley            self.update(progress, rate)
49eb8dc403SDave Cobbley            return False
50eb8dc403SDave Cobbley        return True
51eb8dc403SDave Cobbley
52eb8dc403SDave Cobbley
53eb8dc403SDave Cobbleyclass Wget(FetchMethod):
540ca19ccfSPatrick Williams    """Class to fetch urls via 'wget'"""
55d1e89497SAndrew Geissler
56d1e89497SAndrew Geissler    # CDNs like CloudFlare may do a 'browser integrity test' which can fail
57d1e89497SAndrew Geissler    # with the standard wget/urllib User-Agent, so pretend to be a modern
58d1e89497SAndrew Geissler    # browser.
59d1e89497SAndrew Geissler    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
60d1e89497SAndrew Geissler
610ca19ccfSPatrick Williams    def check_certs(self, d):
620ca19ccfSPatrick Williams        """
630ca19ccfSPatrick Williams        Should certificates be checked?
640ca19ccfSPatrick Williams        """
650ca19ccfSPatrick Williams        return (d.getVar("BB_CHECK_SSL_CERTS") or "1") != "0"
660ca19ccfSPatrick Williams
67eb8dc403SDave Cobbley    def supports(self, ud, d):
68eb8dc403SDave Cobbley        """
69eb8dc403SDave Cobbley        Check to see if a given url can be fetched with wget.
70eb8dc403SDave Cobbley        """
715199d831SAndrew Geissler        return ud.type in ['http', 'https', 'ftp', 'ftps']
72eb8dc403SDave Cobbley
73eb8dc403SDave Cobbley    def recommends_checksum(self, urldata):
74eb8dc403SDave Cobbley        return True
75eb8dc403SDave Cobbley
76eb8dc403SDave Cobbley    def urldata_init(self, ud, d):
77eb8dc403SDave Cobbley        if 'protocol' in ud.parm:
78eb8dc403SDave Cobbley            if ud.parm['protocol'] == 'git':
79eb8dc403SDave Cobbley                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)
80eb8dc403SDave Cobbley
81eb8dc403SDave Cobbley        if 'downloadfilename' in ud.parm:
82eb8dc403SDave Cobbley            ud.basename = ud.parm['downloadfilename']
83eb8dc403SDave Cobbley        else:
84eb8dc403SDave Cobbley            ud.basename = os.path.basename(ud.path)
85eb8dc403SDave Cobbley
86eb8dc403SDave Cobbley        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
87eb8dc403SDave Cobbley        if not ud.localfile:
88eb8dc403SDave Cobbley            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))
89eb8dc403SDave Cobbley
90*44b3caf2SPatrick Williams        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30"
91*44b3caf2SPatrick Williams
92*44b3caf2SPatrick Williams        if ud.type == 'ftp' or ud.type == 'ftps':
93*44b3caf2SPatrick Williams            self.basecmd += " --passive-ftp"
940ca19ccfSPatrick Williams
950ca19ccfSPatrick Williams        if not self.check_certs(d):
960ca19ccfSPatrick Williams            self.basecmd += " --no-check-certificate"
97eb8dc403SDave Cobbley
98eb8dc403SDave Cobbley    def _runwget(self, ud, d, command, quiet, workdir=None):
99eb8dc403SDave Cobbley
100eb8dc403SDave Cobbley        progresshandler = WgetProgressHandler(d)
101eb8dc403SDave Cobbley
102d1e89497SAndrew Geissler        logger.debug2("Fetching %s using command '%s'" % (ud.url, command))
103eb8dc403SDave Cobbley        bb.fetch2.check_network_access(d, command, ud.url)
104eb8dc403SDave Cobbley        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)
105eb8dc403SDave Cobbley
106eb8dc403SDave Cobbley    def download(self, ud, d):
107eb8dc403SDave Cobbley        """Fetch urls"""
108eb8dc403SDave Cobbley
109eb8dc403SDave Cobbley        fetchcmd = self.basecmd
110eb8dc403SDave Cobbley
11178b72798SAndrew Geissler        localpath = os.path.join(d.getVar("DL_DIR"), ud.localfile) + ".tmp"
11282c905dcSAndrew Geissler        bb.utils.mkdirhier(os.path.dirname(localpath))
11382c905dcSAndrew Geissler        fetchcmd += " -O %s" % shlex.quote(localpath)
114eb8dc403SDave Cobbley
115eb8dc403SDave Cobbley        if ud.user and ud.pswd:
116595f6308SAndrew Geissler            fetchcmd += " --auth-no-challenge"
117595f6308SAndrew Geissler            if ud.parm.get("redirectauth", "1") == "1":
118595f6308SAndrew Geissler                # An undocumented feature of wget is that if the
119595f6308SAndrew Geissler                # username/password are specified on the URI, wget will only
120595f6308SAndrew Geissler                # send the Authorization header to the first host and not to
121595f6308SAndrew Geissler                # any hosts that it is redirected to.  With the increasing
122595f6308SAndrew Geissler                # usage of temporary AWS URLs, this difference now matters as
123595f6308SAndrew Geissler                # AWS will reject any request that has authentication both in
124595f6308SAndrew Geissler                # the query parameters (from the redirect) and in the
125595f6308SAndrew Geissler                # Authorization header.
126595f6308SAndrew Geissler                fetchcmd += " --user=%s --password=%s" % (ud.user, ud.pswd)
127eb8dc403SDave Cobbley
128eb8dc403SDave Cobbley        uri = ud.url.split(";")[0]
129eb8dc403SDave Cobbley        if os.path.exists(ud.localpath):
130eb8dc403SDave Cobbley            # file exists, but we didnt complete it.. trying again..
131eb8dc403SDave Cobbley            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
132eb8dc403SDave Cobbley        else:
133eb8dc403SDave Cobbley            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)
134eb8dc403SDave Cobbley
135eb8dc403SDave Cobbley        self._runwget(ud, d, fetchcmd, False)
136eb8dc403SDave Cobbley
13787f5cff0SAndrew Geissler        # Try and verify any checksum now, meaning if it isn't correct, we don't remove the
13887f5cff0SAndrew Geissler        # original file, which might be a race (imagine two recipes referencing the same
13987f5cff0SAndrew Geissler        # source, one with an incorrect checksum)
14087f5cff0SAndrew Geissler        bb.fetch2.verify_checksum(ud, d, localpath=localpath, fatal_nochecksum=False)
14187f5cff0SAndrew Geissler
14278b72798SAndrew Geissler        # Remove the ".tmp" and move the file into position atomically
14378b72798SAndrew Geissler        # Our lock prevents multiple writers but mirroring code may grab incomplete files
14478b72798SAndrew Geissler        os.rename(localpath, localpath[:-4])
14578b72798SAndrew Geissler
146eb8dc403SDave Cobbley        # Sanity check since wget can pretend it succeed when it didn't
147eb8dc403SDave Cobbley        # Also, this used to happen if sourceforge sent us to the mirror page
148eb8dc403SDave Cobbley        if not os.path.exists(ud.localpath):
149eb8dc403SDave Cobbley            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)
150eb8dc403SDave Cobbley
151eb8dc403SDave Cobbley        if os.path.getsize(ud.localpath) == 0:
152eb8dc403SDave Cobbley            os.remove(ud.localpath)
153eb8dc403SDave Cobbley            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)
154eb8dc403SDave Cobbley
155eb8dc403SDave Cobbley        return True
156eb8dc403SDave Cobbley
157eb8dc403SDave Cobbley    def checkstatus(self, fetch, ud, d, try_again=True):
158eb8dc403SDave Cobbley        class HTTPConnectionCache(http.client.HTTPConnection):
159eb8dc403SDave Cobbley            if fetch.connection_cache:
160eb8dc403SDave Cobbley                def connect(self):
161eb8dc403SDave Cobbley                    """Connect to the host and port specified in __init__."""
162eb8dc403SDave Cobbley
163eb8dc403SDave Cobbley                    sock = fetch.connection_cache.get_connection(self.host, self.port)
164eb8dc403SDave Cobbley                    if sock:
165eb8dc403SDave Cobbley                        self.sock = sock
166eb8dc403SDave Cobbley                    else:
167eb8dc403SDave Cobbley                        self.sock = socket.create_connection((self.host, self.port),
168eb8dc403SDave Cobbley                                    self.timeout, self.source_address)
169eb8dc403SDave Cobbley                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)
170eb8dc403SDave Cobbley
171eb8dc403SDave Cobbley                    if self._tunnel_host:
172eb8dc403SDave Cobbley                        self._tunnel()
173eb8dc403SDave Cobbley
174eb8dc403SDave Cobbley        class CacheHTTPHandler(urllib.request.HTTPHandler):
175eb8dc403SDave Cobbley            def http_open(self, req):
176eb8dc403SDave Cobbley                return self.do_open(HTTPConnectionCache, req)
177eb8dc403SDave Cobbley
178eb8dc403SDave Cobbley            def do_open(self, http_class, req):
179eb8dc403SDave Cobbley                """Return an addinfourl object for the request, using http_class.
180eb8dc403SDave Cobbley
181eb8dc403SDave Cobbley                http_class must implement the HTTPConnection API from httplib.
182eb8dc403SDave Cobbley                The addinfourl return value is a file-like object.  It also
183eb8dc403SDave Cobbley                has methods and attributes including:
184eb8dc403SDave Cobbley                    - info(): return a mimetools.Message object for the headers
185eb8dc403SDave Cobbley                    - geturl(): return the original request URL
186eb8dc403SDave Cobbley                    - code: HTTP status code
187eb8dc403SDave Cobbley                """
188eb8dc403SDave Cobbley                host = req.host
189eb8dc403SDave Cobbley                if not host:
19019323693SBrad Bishop                    raise urllib.error.URLError('no host given')
191eb8dc403SDave Cobbley
192eb8dc403SDave Cobbley                h = http_class(host, timeout=req.timeout) # will parse host:port
193eb8dc403SDave Cobbley                h.set_debuglevel(self._debuglevel)
194eb8dc403SDave Cobbley
195eb8dc403SDave Cobbley                headers = dict(req.unredirected_hdrs)
196eb8dc403SDave Cobbley                headers.update(dict((k, v) for k, v in list(req.headers.items())
197eb8dc403SDave Cobbley                            if k not in headers))
198eb8dc403SDave Cobbley
199eb8dc403SDave Cobbley                # We want to make an HTTP/1.1 request, but the addinfourl
200eb8dc403SDave Cobbley                # class isn't prepared to deal with a persistent connection.
201eb8dc403SDave Cobbley                # It will try to read all remaining data from the socket,
202eb8dc403SDave Cobbley                # which will block while the server waits for the next request.
203eb8dc403SDave Cobbley                # So make sure the connection gets closed after the (only)
204eb8dc403SDave Cobbley                # request.
205eb8dc403SDave Cobbley
206eb8dc403SDave Cobbley                # Don't close connection when connection_cache is enabled,
207eb8dc403SDave Cobbley                if fetch.connection_cache is None:
208eb8dc403SDave Cobbley                    headers["Connection"] = "close"
209eb8dc403SDave Cobbley                else:
210eb8dc403SDave Cobbley                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0
211eb8dc403SDave Cobbley
212eb8dc403SDave Cobbley                headers = dict(
213eb8dc403SDave Cobbley                    (name.title(), val) for name, val in list(headers.items()))
214eb8dc403SDave Cobbley
215eb8dc403SDave Cobbley                if req._tunnel_host:
216eb8dc403SDave Cobbley                    tunnel_headers = {}
217eb8dc403SDave Cobbley                    proxy_auth_hdr = "Proxy-Authorization"
218eb8dc403SDave Cobbley                    if proxy_auth_hdr in headers:
219eb8dc403SDave Cobbley                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
220eb8dc403SDave Cobbley                        # Proxy-Authorization should not be sent to origin
221eb8dc403SDave Cobbley                        # server.
222eb8dc403SDave Cobbley                        del headers[proxy_auth_hdr]
223eb8dc403SDave Cobbley                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)
224eb8dc403SDave Cobbley
225eb8dc403SDave Cobbley                try:
226eb8dc403SDave Cobbley                    h.request(req.get_method(), req.selector, req.data, headers)
227eb8dc403SDave Cobbley                except socket.error as err: # XXX what error?
228eb8dc403SDave Cobbley                    # Don't close connection when cache is enabled.
229eb8dc403SDave Cobbley                    # Instead, try to detect connections that are no longer
230eb8dc403SDave Cobbley                    # usable (for example, closed unexpectedly) and remove
231eb8dc403SDave Cobbley                    # them from the cache.
232eb8dc403SDave Cobbley                    if fetch.connection_cache is None:
233eb8dc403SDave Cobbley                        h.close()
234eb8dc403SDave Cobbley                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
235eb8dc403SDave Cobbley                        # This happens when the server closes the connection despite the Keep-Alive.
236eb8dc403SDave Cobbley                        # Apparently urllib then uses the file descriptor, expecting it to be
237eb8dc403SDave Cobbley                        # connected, when in reality the connection is already gone.
238eb8dc403SDave Cobbley                        # We let the request fail and expect it to be
239eb8dc403SDave Cobbley                        # tried once more ("try_again" in check_status()),
240eb8dc403SDave Cobbley                        # with the dead connection removed from the cache.
2417e0e3c0cSAndrew Geissler                        # If it still fails, we give up, which can happen for bad
242eb8dc403SDave Cobbley                        # HTTP proxy settings.
243eb8dc403SDave Cobbley                        fetch.connection_cache.remove_connection(h.host, h.port)
244eb8dc403SDave Cobbley                    raise urllib.error.URLError(err)
245eb8dc403SDave Cobbley                else:
246eb8dc403SDave Cobbley                    r = h.getresponse()
247eb8dc403SDave Cobbley
248eb8dc403SDave Cobbley                # Pick apart the HTTPResponse object to get the addinfourl
249eb8dc403SDave Cobbley                # object initialized properly.
250eb8dc403SDave Cobbley
251eb8dc403SDave Cobbley                # Wrap the HTTPResponse object in socket's file object adapter
252eb8dc403SDave Cobbley                # for Windows.  That adapter calls recv(), so delegate recv()
253eb8dc403SDave Cobbley                # to read().  This weird wrapping allows the returned object to
254eb8dc403SDave Cobbley                # have readline() and readlines() methods.
255eb8dc403SDave Cobbley
256eb8dc403SDave Cobbley                # XXX It might be better to extract the read buffering code
257eb8dc403SDave Cobbley                # out of socket._fileobject() and into a base class.
258eb8dc403SDave Cobbley                r.recv = r.read
259eb8dc403SDave Cobbley
260eb8dc403SDave Cobbley                # no data, just have to read
261eb8dc403SDave Cobbley                r.read()
262eb8dc403SDave Cobbley                class fp_dummy(object):
263eb8dc403SDave Cobbley                    def read(self):
264eb8dc403SDave Cobbley                        return ""
265eb8dc403SDave Cobbley                    def readline(self):
266eb8dc403SDave Cobbley                        return ""
267eb8dc403SDave Cobbley                    def close(self):
268eb8dc403SDave Cobbley                        pass
269eb8dc403SDave Cobbley                    closed = False
270eb8dc403SDave Cobbley
27119323693SBrad Bishop                resp = urllib.response.addinfourl(fp_dummy(), r.msg, req.get_full_url())
272eb8dc403SDave Cobbley                resp.code = r.status
273eb8dc403SDave Cobbley                resp.msg = r.reason
274eb8dc403SDave Cobbley
275eb8dc403SDave Cobbley                # Close connection when server request it.
276eb8dc403SDave Cobbley                if fetch.connection_cache is not None:
277eb8dc403SDave Cobbley                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
278eb8dc403SDave Cobbley                        fetch.connection_cache.remove_connection(h.host, h.port)
279eb8dc403SDave Cobbley
280eb8dc403SDave Cobbley                return resp
281eb8dc403SDave Cobbley
282eb8dc403SDave Cobbley        class HTTPMethodFallback(urllib.request.BaseHandler):
283eb8dc403SDave Cobbley            """
284eb8dc403SDave Cobbley            Fallback to GET if HEAD is not allowed (405 HTTP error)
285eb8dc403SDave Cobbley            """
286eb8dc403SDave Cobbley            def http_error_405(self, req, fp, code, msg, headers):
287eb8dc403SDave Cobbley                fp.read()
288eb8dc403SDave Cobbley                fp.close()
289eb8dc403SDave Cobbley
29008902b01SBrad Bishop                if req.get_method() != 'GET':
291eb8dc403SDave Cobbley                    newheaders = dict((k, v) for k, v in list(req.headers.items())
292eb8dc403SDave Cobbley                                      if k.lower() not in ("content-length", "content-type"))
293eb8dc403SDave Cobbley                    return self.parent.open(urllib.request.Request(req.get_full_url(),
294eb8dc403SDave Cobbley                                                            headers=newheaders,
295eb8dc403SDave Cobbley                                                            origin_req_host=req.origin_req_host,
296eb8dc403SDave Cobbley                                                            unverifiable=True))
297eb8dc403SDave Cobbley
29808902b01SBrad Bishop                raise urllib.request.HTTPError(req, code, msg, headers, None)
29919323693SBrad Bishop
30019323693SBrad Bishop            # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
30119323693SBrad Bishop            # Forbidden when they actually mean 405 Method Not Allowed.
302eb8dc403SDave Cobbley            http_error_403 = http_error_405
303eb8dc403SDave Cobbley
304eb8dc403SDave Cobbley
305eb8dc403SDave Cobbley        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
306eb8dc403SDave Cobbley            """
307eb8dc403SDave Cobbley            urllib2.HTTPRedirectHandler resets the method to GET on redirect,
308eb8dc403SDave Cobbley            when we want to follow redirects using the original method.
309eb8dc403SDave Cobbley            """
310eb8dc403SDave Cobbley            def redirect_request(self, req, fp, code, msg, headers, newurl):
311eb8dc403SDave Cobbley                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
31219323693SBrad Bishop                newreq.get_method = req.get_method
313eb8dc403SDave Cobbley                return newreq
314eb8dc403SDave Cobbley
3150ca19ccfSPatrick Williams        # We need to update the environment here as both the proxy and HTTPS
3160ca19ccfSPatrick Williams        # handlers need variables set. The proxy needs http_proxy and friends to
3170ca19ccfSPatrick Williams        # be set, and HTTPSHandler ends up calling into openssl to load the
3180ca19ccfSPatrick Williams        # certificates. In buildtools configurations this will be looking at the
3190ca19ccfSPatrick Williams        # wrong place for certificates by default: we set SSL_CERT_FILE to the
3200ca19ccfSPatrick Williams        # right location in the buildtools environment script but as BitBake
3210ca19ccfSPatrick Williams        # prunes prunes the environment this is lost. When binaries are executed
3220ca19ccfSPatrick Williams        # runfetchcmd ensures these values are in the environment, but this is
3230ca19ccfSPatrick Williams        # pure Python so we need to update the environment.
3240ca19ccfSPatrick Williams        #
3250ca19ccfSPatrick Williams        # Avoid tramping the environment too much by using bb.utils.environment
3260ca19ccfSPatrick Williams        # to scope the changes to the build_opener request, which is when the
3270ca19ccfSPatrick Williams        # environment lookups happen.
3287e0e3c0cSAndrew Geissler        newenv = bb.fetch2.get_fetcher_environment(d)
3290ca19ccfSPatrick Williams
3300ca19ccfSPatrick Williams        with bb.utils.environment(**newenv):
331eb8dc403SDave Cobbley            import ssl
3320ca19ccfSPatrick Williams
3330ca19ccfSPatrick Williams            if self.check_certs(d):
3340ca19ccfSPatrick Williams                context = ssl.create_default_context()
3350ca19ccfSPatrick Williams            else:
3360ca19ccfSPatrick Williams                context = ssl._create_unverified_context()
3370ca19ccfSPatrick Williams
3380ca19ccfSPatrick Williams            handlers = [FixedHTTPRedirectHandler,
3390ca19ccfSPatrick Williams                        HTTPMethodFallback,
3400ca19ccfSPatrick Williams                        urllib.request.ProxyHandler(),
3410ca19ccfSPatrick Williams                        CacheHTTPHandler(),
3420ca19ccfSPatrick Williams                        urllib.request.HTTPSHandler(context=context)]
343eb8dc403SDave Cobbley            opener = urllib.request.build_opener(*handlers)
344eb8dc403SDave Cobbley
345eb8dc403SDave Cobbley            try:
346517393d9SAndrew Geissler                uri_base = ud.url.split(";")[0]
347517393d9SAndrew Geissler                uri = "{}://{}{}".format(urllib.parse.urlparse(uri_base).scheme, ud.host, ud.path)
348eb8dc403SDave Cobbley                r = urllib.request.Request(uri)
349eb8dc403SDave Cobbley                r.get_method = lambda: "HEAD"
350eb8dc403SDave Cobbley                # Some servers (FusionForge, as used on Alioth) require that the
351eb8dc403SDave Cobbley                # optional Accept header is set.
352eb8dc403SDave Cobbley                r.add_header("Accept", "*/*")
353d1e89497SAndrew Geissler                r.add_header("User-Agent", self.user_agent)
354eb8dc403SDave Cobbley                def add_basic_auth(login_str, request):
355eb8dc403SDave Cobbley                    '''Adds Basic auth to http request, pass in login:password as string'''
356eb8dc403SDave Cobbley                    import base64
357eb8dc403SDave Cobbley                    encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
358eb8dc403SDave Cobbley                    authheader = "Basic %s" % encodeuser
359eb8dc403SDave Cobbley                    r.add_header("Authorization", authheader)
360eb8dc403SDave Cobbley
36119323693SBrad Bishop                if ud.user and ud.pswd:
36219323693SBrad Bishop                    add_basic_auth(ud.user + ':' + ud.pswd, r)
363eb8dc403SDave Cobbley
364eb8dc403SDave Cobbley                try:
36519323693SBrad Bishop                    import netrc
3666aa7eec5SAndrew Geissler                    auth_data = netrc.netrc().authenticators(urllib.parse.urlparse(uri).hostname)
3676aa7eec5SAndrew Geissler                    if auth_data:
3686aa7eec5SAndrew Geissler                        login, _, password = auth_data
369eb8dc403SDave Cobbley                        add_basic_auth("%s:%s" % (login, password), r)
3706aa7eec5SAndrew Geissler                except (FileNotFoundError, netrc.NetrcParseError):
371eb8dc403SDave Cobbley                    pass
372eb8dc403SDave Cobbley
373595f6308SAndrew Geissler                with opener.open(r, timeout=30) as response:
374eb8dc403SDave Cobbley                    pass
375fc113eadSAndrew Geissler            except (urllib.error.URLError, ConnectionResetError, TimeoutError) as e:
37690fd73cbSAndrew Geissler                if try_again:
37790fd73cbSAndrew Geissler                    logger.debug2("checkstatus: trying again")
37890fd73cbSAndrew Geissler                    return self.checkstatus(fetch, ud, d, False)
37990fd73cbSAndrew Geissler                else:
38090fd73cbSAndrew Geissler                    # debug for now to avoid spamming the logs in e.g. remote sstate searches
381705982a5SPatrick Williams                    logger.debug2("checkstatus() urlopen failed for %s: %s" % (uri,e))
38290fd73cbSAndrew Geissler                    return False
383d159c7fbSAndrew Geissler
384eb8dc403SDave Cobbley        return True
385eb8dc403SDave Cobbley
386eb8dc403SDave Cobbley    def _parse_path(self, regex, s):
387eb8dc403SDave Cobbley        """
388eb8dc403SDave Cobbley        Find and group name, version and archive type in the given string s
389eb8dc403SDave Cobbley        """
390eb8dc403SDave Cobbley
391eb8dc403SDave Cobbley        m = regex.search(s)
392eb8dc403SDave Cobbley        if m:
393eb8dc403SDave Cobbley            pname = ''
394eb8dc403SDave Cobbley            pver = ''
395eb8dc403SDave Cobbley            ptype = ''
396eb8dc403SDave Cobbley
397eb8dc403SDave Cobbley            mdict = m.groupdict()
398eb8dc403SDave Cobbley            if 'name' in mdict.keys():
399eb8dc403SDave Cobbley                pname = mdict['name']
400eb8dc403SDave Cobbley            if 'pver' in mdict.keys():
401eb8dc403SDave Cobbley                pver = mdict['pver']
402eb8dc403SDave Cobbley            if 'type' in mdict.keys():
403eb8dc403SDave Cobbley                ptype = mdict['type']
404eb8dc403SDave Cobbley
405eb8dc403SDave Cobbley            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))
406eb8dc403SDave Cobbley
407eb8dc403SDave Cobbley            return (pname, pver, ptype)
408eb8dc403SDave Cobbley
409eb8dc403SDave Cobbley        return None
410eb8dc403SDave Cobbley
411eb8dc403SDave Cobbley    def _modelate_version(self, version):
412eb8dc403SDave Cobbley        if version[0] in ['.', '-']:
413eb8dc403SDave Cobbley            if version[1].isdigit():
414eb8dc403SDave Cobbley                version = version[1] + version[0] + version[2:len(version)]
415eb8dc403SDave Cobbley            else:
416eb8dc403SDave Cobbley                version = version[1:len(version)]
417eb8dc403SDave Cobbley
418eb8dc403SDave Cobbley        version = re.sub('-', '.', version)
419eb8dc403SDave Cobbley        version = re.sub('_', '.', version)
420eb8dc403SDave Cobbley        version = re.sub('(rc)+', '.1000.', version)
421eb8dc403SDave Cobbley        version = re.sub('(beta)+', '.100.', version)
422eb8dc403SDave Cobbley        version = re.sub('(alpha)+', '.10.', version)
423eb8dc403SDave Cobbley        if version[0] == 'v':
424eb8dc403SDave Cobbley            version = version[1:len(version)]
425eb8dc403SDave Cobbley        return version
426eb8dc403SDave Cobbley
427eb8dc403SDave Cobbley    def _vercmp(self, old, new):
428eb8dc403SDave Cobbley        """
429eb8dc403SDave Cobbley        Check whether 'new' is newer than 'old' version. We use existing vercmp() for the
430eb8dc403SDave Cobbley        purpose. PE is cleared in comparison as it's not for build, and PR is cleared too
431eb8dc403SDave Cobbley        for simplicity as it's somehow difficult to get from various upstream format
432eb8dc403SDave Cobbley        """
433eb8dc403SDave Cobbley
434eb8dc403SDave Cobbley        (oldpn, oldpv, oldsuffix) = old
435eb8dc403SDave Cobbley        (newpn, newpv, newsuffix) = new
436eb8dc403SDave Cobbley
43719323693SBrad Bishop        # Check for a new suffix type that we have never heard of before
43819323693SBrad Bishop        if newsuffix:
439eb8dc403SDave Cobbley            m = self.suffix_regex_comp.search(newsuffix)
440eb8dc403SDave Cobbley            if not m:
441eb8dc403SDave Cobbley                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
442eb8dc403SDave Cobbley                return False
443eb8dc403SDave Cobbley
44419323693SBrad Bishop        # Not our package so ignore it
445eb8dc403SDave Cobbley        if oldpn != newpn:
446eb8dc403SDave Cobbley            return False
447eb8dc403SDave Cobbley
448eb8dc403SDave Cobbley        oldpv = self._modelate_version(oldpv)
449eb8dc403SDave Cobbley        newpv = self._modelate_version(newpv)
450eb8dc403SDave Cobbley
451eb8dc403SDave Cobbley        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))
452eb8dc403SDave Cobbley
453eb8dc403SDave Cobbley    def _fetch_index(self, uri, ud, d):
454eb8dc403SDave Cobbley        """
455eb8dc403SDave Cobbley        Run fetch checkstatus to get directory information
456eb8dc403SDave Cobbley        """
457eb8dc403SDave Cobbley        f = tempfile.NamedTemporaryFile()
458eb8dc403SDave Cobbley        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
459eb8dc403SDave Cobbley            fetchcmd = self.basecmd
460d1e89497SAndrew Geissler            fetchcmd += " -O " + f.name + " --user-agent='" + self.user_agent + "' '" + uri + "'"
461eb8dc403SDave Cobbley            try:
462eb8dc403SDave Cobbley                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
463eb8dc403SDave Cobbley                fetchresult = f.read()
464eb8dc403SDave Cobbley            except bb.fetch2.BBFetchException:
465eb8dc403SDave Cobbley                fetchresult = ""
466eb8dc403SDave Cobbley
467eb8dc403SDave Cobbley        return fetchresult
468eb8dc403SDave Cobbley
469eb8dc403SDave Cobbley    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
470eb8dc403SDave Cobbley        """
471eb8dc403SDave Cobbley        Return the latest version of a package inside a given directory path
472eb8dc403SDave Cobbley        If error or no version, return ""
473eb8dc403SDave Cobbley        """
474eb8dc403SDave Cobbley        valid = 0
475eb8dc403SDave Cobbley        version = ['', '', '']
476eb8dc403SDave Cobbley
477eb8dc403SDave Cobbley        bb.debug(3, "VersionURL: %s" % (url))
478eb8dc403SDave Cobbley        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
479eb8dc403SDave Cobbley        if not soup:
480eb8dc403SDave Cobbley            bb.debug(3, "*** %s NO SOUP" % (url))
481eb8dc403SDave Cobbley            return ""
482eb8dc403SDave Cobbley
483eb8dc403SDave Cobbley        for line in soup.find_all('a', href=True):
484eb8dc403SDave Cobbley            bb.debug(3, "line['href'] = '%s'" % (line['href']))
485eb8dc403SDave Cobbley            bb.debug(3, "line = '%s'" % (str(line)))
486eb8dc403SDave Cobbley
487eb8dc403SDave Cobbley            newver = self._parse_path(package_regex, line['href'])
488eb8dc403SDave Cobbley            if not newver:
489eb8dc403SDave Cobbley                newver = self._parse_path(package_regex, str(line))
490eb8dc403SDave Cobbley
491eb8dc403SDave Cobbley            if newver:
492eb8dc403SDave Cobbley                bb.debug(3, "Upstream version found: %s" % newver[1])
493eb8dc403SDave Cobbley                if valid == 0:
494eb8dc403SDave Cobbley                    version = newver
495eb8dc403SDave Cobbley                    valid = 1
496eb8dc403SDave Cobbley                elif self._vercmp(version, newver) < 0:
497eb8dc403SDave Cobbley                    version = newver
498eb8dc403SDave Cobbley
499eb8dc403SDave Cobbley        pupver = re.sub('_', '.', version[1])
500eb8dc403SDave Cobbley
501eb8dc403SDave Cobbley        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
502eb8dc403SDave Cobbley                (package, pupver or "N/A", current_version[1]))
503eb8dc403SDave Cobbley
504eb8dc403SDave Cobbley        if valid:
505eb8dc403SDave Cobbley            return pupver
506eb8dc403SDave Cobbley
507eb8dc403SDave Cobbley        return ""
508eb8dc403SDave Cobbley
50919323693SBrad Bishop    def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d):
510eb8dc403SDave Cobbley        """
511eb8dc403SDave Cobbley        Scan every directory in order to get upstream version.
512eb8dc403SDave Cobbley        """
513eb8dc403SDave Cobbley        version_dir = ['', '', '']
514eb8dc403SDave Cobbley        version = ['', '', '']
515eb8dc403SDave Cobbley
516ac69b488SWilliam A. Kennington III        dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])*(\d+))")
517eb8dc403SDave Cobbley        s = dirver_regex.search(dirver)
518eb8dc403SDave Cobbley        if s:
519eb8dc403SDave Cobbley            version_dir[1] = s.group('ver')
520eb8dc403SDave Cobbley        else:
521eb8dc403SDave Cobbley            version_dir[1] = dirver
522eb8dc403SDave Cobbley
523eb8dc403SDave Cobbley        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
524eb8dc403SDave Cobbley                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
525eb8dc403SDave Cobbley        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))
526eb8dc403SDave Cobbley
527eb8dc403SDave Cobbley        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
528eb8dc403SDave Cobbley        if not soup:
529eb8dc403SDave Cobbley            return version[1]
530eb8dc403SDave Cobbley
531eb8dc403SDave Cobbley        for line in soup.find_all('a', href=True):
532eb8dc403SDave Cobbley            s = dirver_regex.search(line['href'].strip("/"))
533eb8dc403SDave Cobbley            if s:
534eb8dc403SDave Cobbley                sver = s.group('ver')
535eb8dc403SDave Cobbley
536eb8dc403SDave Cobbley                # When prefix is part of the version directory it need to
537eb8dc403SDave Cobbley                # ensure that only version directory is used so remove previous
538eb8dc403SDave Cobbley                # directories if exists.
539eb8dc403SDave Cobbley                #
540eb8dc403SDave Cobbley                # Example: pfx = '/dir1/dir2/v' and version = '2.5' the expected
541eb8dc403SDave Cobbley                # result is v2.5.
542eb8dc403SDave Cobbley                spfx = s.group('pfx').split('/')[-1]
543eb8dc403SDave Cobbley
544eb8dc403SDave Cobbley                version_dir_new = ['', sver, '']
545eb8dc403SDave Cobbley                if self._vercmp(version_dir, version_dir_new) <= 0:
546eb8dc403SDave Cobbley                    dirver_new = spfx + sver
547eb8dc403SDave Cobbley                    path = ud.path.replace(dirver, dirver_new, True) \
548eb8dc403SDave Cobbley                        .split(package)[0]
549eb8dc403SDave Cobbley                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
550eb8dc403SDave Cobbley                        ud.user, ud.pswd, {}])
551eb8dc403SDave Cobbley
552eb8dc403SDave Cobbley                    pupver = self._check_latest_version(uri,
553eb8dc403SDave Cobbley                            package, package_regex, current_version, ud, d)
554eb8dc403SDave Cobbley                    if pupver:
555eb8dc403SDave Cobbley                        version[1] = pupver
556eb8dc403SDave Cobbley
557eb8dc403SDave Cobbley                    version_dir = version_dir_new
558eb8dc403SDave Cobbley
559eb8dc403SDave Cobbley        return version[1]
560eb8dc403SDave Cobbley
561eb8dc403SDave Cobbley    def _init_regexes(self, package, ud, d):
562eb8dc403SDave Cobbley        """
563eb8dc403SDave Cobbley        Match as many patterns as possible such as:
564eb8dc403SDave Cobbley                gnome-common-2.20.0.tar.gz (most common format)
565eb8dc403SDave Cobbley                gtk+-2.90.1.tar.gz
566eb8dc403SDave Cobbley                xf86-input-synaptics-12.6.9.tar.gz
567eb8dc403SDave Cobbley                dri2proto-2.3.tar.gz
568eb8dc403SDave Cobbley                blktool_4.orig.tar.gz
569eb8dc403SDave Cobbley                libid3tag-0.15.1b.tar.gz
570eb8dc403SDave Cobbley                unzip552.tar.gz
571eb8dc403SDave Cobbley                icu4c-3_6-src.tgz
572eb8dc403SDave Cobbley                genext2fs_1.3.orig.tar.gz
573eb8dc403SDave Cobbley                gst-fluendo-mp3
574eb8dc403SDave Cobbley        """
575eb8dc403SDave Cobbley        # match most patterns which uses "-" as separator to version digits
57619323693SBrad Bishop        pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
577eb8dc403SDave Cobbley        # a loose pattern such as for unzip552.tar.gz
57819323693SBrad Bishop        pn_prefix2 = r"[a-zA-Z]+"
579eb8dc403SDave Cobbley        # a loose pattern such as for 80325-quicky-0.4.tar.gz
58019323693SBrad Bishop        pn_prefix3 = r"[0-9]+[-]?[a-zA-Z]+"
581eb8dc403SDave Cobbley        # Save the Package Name (pn) Regex for use later
58219323693SBrad Bishop        pn_regex = r"(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)
583eb8dc403SDave Cobbley
584eb8dc403SDave Cobbley        # match version
58519323693SBrad Bishop        pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"
586eb8dc403SDave Cobbley
587eb8dc403SDave Cobbley        # match arch
588eb8dc403SDave Cobbley        parch_regex = "-source|_all_"
589eb8dc403SDave Cobbley
590eb8dc403SDave Cobbley        # src.rpm extension was added only for rpm package. Can be removed if the rpm
591eb8dc403SDave Cobbley        # packaged will always be considered as having to be manually upgraded
592595f6308SAndrew Geissler        psuffix_regex = r"(tar\.\w+|tgz|zip|xz|rpm|bz2|orig\.tar\.\w+|src\.tar\.\w+|src\.tgz|svnr\d+\.tar\.\w+|stable\.tar\.\w+|src\.rpm)"
593eb8dc403SDave Cobbley
594eb8dc403SDave Cobbley        # match name, version and archive type of a package
59519323693SBrad Bishop        package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
596eb8dc403SDave Cobbley                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
597eb8dc403SDave Cobbley        self.suffix_regex_comp = re.compile(psuffix_regex)
598eb8dc403SDave Cobbley
599eb8dc403SDave Cobbley        # compile regex, can be specific by package or generic regex
600eb8dc403SDave Cobbley        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
601eb8dc403SDave Cobbley        if pn_regex:
602eb8dc403SDave Cobbley            package_custom_regex_comp = re.compile(pn_regex)
603eb8dc403SDave Cobbley        else:
604eb8dc403SDave Cobbley            version = self._parse_path(package_regex_comp, package)
605eb8dc403SDave Cobbley            if version:
606eb8dc403SDave Cobbley                package_custom_regex_comp = re.compile(
60719323693SBrad Bishop                    r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
608eb8dc403SDave Cobbley                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
609eb8dc403SDave Cobbley            else:
610eb8dc403SDave Cobbley                package_custom_regex_comp = None
611eb8dc403SDave Cobbley
612eb8dc403SDave Cobbley        return package_custom_regex_comp
613eb8dc403SDave Cobbley
614eb8dc403SDave Cobbley    def latest_versionstring(self, ud, d):
615eb8dc403SDave Cobbley        """
616eb8dc403SDave Cobbley        Manipulate the URL and try to obtain the latest package version
617eb8dc403SDave Cobbley
618eb8dc403SDave Cobbley        sanity check to ensure same name and type.
619eb8dc403SDave Cobbley        """
620eb8dc403SDave Cobbley        package = ud.path.split("/")[-1]
621eb8dc403SDave Cobbley        current_version = ['', d.getVar('PV'), '']
622eb8dc403SDave Cobbley
623eb8dc403SDave Cobbley        """possible to have no version in pkg name, such as spectrum-fw"""
62419323693SBrad Bishop        if not re.search(r"\d+", package):
625eb8dc403SDave Cobbley            current_version[1] = re.sub('_', '.', current_version[1])
626eb8dc403SDave Cobbley            current_version[1] = re.sub('-', '.', current_version[1])
627eb8dc403SDave Cobbley            return (current_version[1], '')
628eb8dc403SDave Cobbley
629eb8dc403SDave Cobbley        package_regex = self._init_regexes(package, ud, d)
630eb8dc403SDave Cobbley        if package_regex is None:
631eb8dc403SDave Cobbley            bb.warn("latest_versionstring: package %s don't match pattern" % (package))
632eb8dc403SDave Cobbley            return ('', '')
633eb8dc403SDave Cobbley        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))
634eb8dc403SDave Cobbley
635eb8dc403SDave Cobbley        uri = ""
636eb8dc403SDave Cobbley        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
637eb8dc403SDave Cobbley        if not regex_uri:
638eb8dc403SDave Cobbley            path = ud.path.split(package)[0]
639eb8dc403SDave Cobbley
640eb8dc403SDave Cobbley            # search for version matches on folders inside the path, like:
641eb8dc403SDave Cobbley            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
64219323693SBrad Bishop            dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
643517393d9SAndrew Geissler            m = dirver_regex.findall(path)
644eb8dc403SDave Cobbley            if m:
645eb8dc403SDave Cobbley                pn = d.getVar('PN')
646517393d9SAndrew Geissler                dirver = m[-1][0]
647eb8dc403SDave Cobbley
64819323693SBrad Bishop                dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn)))
649eb8dc403SDave Cobbley                if not dirver_pn_regex.search(dirver):
650eb8dc403SDave Cobbley                    return (self._check_latest_version_by_dir(dirver,
651eb8dc403SDave Cobbley                        package, package_regex, current_version, ud, d), '')
652eb8dc403SDave Cobbley
653eb8dc403SDave Cobbley            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
654eb8dc403SDave Cobbley        else:
655eb8dc403SDave Cobbley            uri = regex_uri
656eb8dc403SDave Cobbley
657eb8dc403SDave Cobbley        return (self._check_latest_version(uri, package, package_regex,
658eb8dc403SDave Cobbley                current_version, ud, d), '')
659