"""
BitBake 'Fetch' implementations

Classes for obtaining upstream sources for the
BitBake build tools.

"""

# Copyright (C) 2003, 2004 Chris Larson
#
# SPDX-License-Identifier: GPL-2.0-only
#
# Based on functions from the base bb module, Copyright 2003 Holger Schurig

import shlex
import re
import tempfile
import os
import errno
import bb
import bb.progress
import socket
import http.client
import urllib.request, urllib.parse, urllib.error
from bb.fetch2 import FetchMethod
from bb.fetch2 import FetchError
from bb.fetch2 import logger
from bb.fetch2 import runfetchcmd
from bb.utils import export_proxies
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

class WgetProgressHandler(bb.progress.LineFilterProgressHandler):
    """
    Extract progress information from wget output.
    Note: relies on --progress=dot (with -v or without -q/-nv) being
    specified on the wget command line.
    """
    def __init__(self, d):
        super(WgetProgressHandler, self).__init__(d)
        # Send an initial progress event so the bar gets shown
        self._fire_progress(0)

    def writeline(self, line):
        """Parse one line of wget output; return False (consume the line)
        when it carries progress info, True to pass it through."""
        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
        if percs:
            progress = int(percs[-1][0])
            rate = percs[-1][1] + '/s'
            self.update(progress, rate)
            return False
        return True


class Wget(FetchMethod):

    # CDNs like CloudFlare may do a 'browser integrity test' which can fail
    # with the standard wget/urllib User-Agent, so pretend to be a modern
    # browser.
    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"

    """Class to fetch urls via 'wget'"""
    def supports(self, ud, d):
        """
        Check to see if a given url can be fetched with wget.
        """
        return ud.type in ['http', 'https', 'ftp']

    def recommends_checksum(self, urldata):
        # Files fetched over http/https/ftp should always carry checksums.
        return True

    def urldata_init(self, ud, d):
        """Work out the local filename and base wget command for this url."""
        if 'protocol' in ud.parm:
            if ud.parm['protocol'] == 'git':
                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)

        if 'downloadfilename' in ud.parm:
            ud.basename = ud.parm['downloadfilename']
        else:
            ud.basename = os.path.basename(ud.path)

        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
        if not ud.localfile:
            # No path component (e.g. a bare host url): derive a name from
            # host + path instead.
            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))

        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate"

    def _runwget(self, ud, d, command, quiet, workdir=None):
        """Run the given wget command, feeding its output to a progress handler."""

        progresshandler = WgetProgressHandler(d)

        logger.debug2("Fetching %s using command '%s'" % (ud.url, command))
        bb.fetch2.check_network_access(d, command, ud.url)
        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)

    def download(self, ud, d):
        """Fetch urls"""

        fetchcmd = self.basecmd

        if 'downloadfilename' in ud.parm:
            localpath = os.path.join(d.getVar("DL_DIR"), ud.localfile)
            bb.utils.mkdirhier(os.path.dirname(localpath))
            fetchcmd += " -O %s" % shlex.quote(localpath)

        if ud.user and ud.pswd:
            fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd)

        uri = ud.url.split(";")[0]
        if os.path.exists(ud.localpath):
            # file exists, but we didn't complete it.. trying again..
            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
        else:
            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)

        self._runwget(ud, d, fetchcmd, False)

        # Sanity check since wget can pretend it succeeded when it didn't
        # Also, this used to happen if sourceforge sent us to the mirror page
        if not os.path.exists(ud.localpath):
            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)

        if os.path.getsize(ud.localpath) == 0:
            os.remove(ud.localpath)
            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)

        return True

    def checkstatus(self, fetch, ud, d, try_again=True):
        """Check that the url is reachable using a HEAD request (falling back
        to GET on 405/403); retried once when try_again is True."""
        class HTTPConnectionCache(http.client.HTTPConnection):
            if fetch.connection_cache:
                def connect(self):
                    """Connect to the host and port specified in __init__."""

                    sock = fetch.connection_cache.get_connection(self.host, self.port)
                    if sock:
                        self.sock = sock
                    else:
                        self.sock = socket.create_connection((self.host, self.port),
                                    self.timeout, self.source_address)
                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)

                    if self._tunnel_host:
                        self._tunnel()

        class CacheHTTPHandler(urllib.request.HTTPHandler):
            def http_open(self, req):
                return self.do_open(HTTPConnectionCache, req)

            def do_open(self, http_class, req):
                """Return an addinfourl object for the request, using http_class.

                http_class must implement the HTTPConnection API from httplib.
                The addinfourl return value is a file-like object.  It also
                has methods and attributes including:
                    - info(): return a mimetools.Message object for the headers
                    - geturl(): return the original request URL
                    - code: HTTP status code
                """
                host = req.host
                if not host:
                    raise urllib.error.URLError('no host given')

                h = http_class(host, timeout=req.timeout) # will parse host:port
                h.set_debuglevel(self._debuglevel)

                headers = dict(req.unredirected_hdrs)
                headers.update(dict((k, v) for k, v in list(req.headers.items())
                            if k not in headers))

                # We want to make an HTTP/1.1 request, but the addinfourl
                # class isn't prepared to deal with a persistent connection.
                # It will try to read all remaining data from the socket,
                # which will block while the server waits for the next request.
                # So make sure the connection gets closed after the (only)
                # request.

                # Don't close connection when connection_cache is enabled,
                if fetch.connection_cache is None:
                    headers["Connection"] = "close"
                else:
                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0

                headers = dict(
                    (name.title(), val) for name, val in list(headers.items()))

                if req._tunnel_host:
                    tunnel_headers = {}
                    proxy_auth_hdr = "Proxy-Authorization"
                    if proxy_auth_hdr in headers:
                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                        # Proxy-Authorization should not be sent to origin
                        # server.
                        del headers[proxy_auth_hdr]
                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

                try:
                    h.request(req.get_method(), req.selector, req.data, headers)
                except socket.error as err: # XXX what error?
                    # Don't close connection when cache is enabled.
                    # Instead, try to detect connections that are no longer
                    # usable (for example, closed unexpectedly) and remove
                    # them from the cache.
                    if fetch.connection_cache is None:
                        h.close()
                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
                        # This happens when the server closes the connection despite the Keep-Alive.
                        # Apparently urllib then uses the file descriptor, expecting it to be
                        # connected, when in reality the connection is already gone.
                        # We let the request fail and expect it to be
                        # tried once more ("try_again" in check_status()),
                        # with the dead connection removed from the cache.
                        # If it still fails, we give up, which can happen for bad
                        # HTTP proxy settings.
                        fetch.connection_cache.remove_connection(h.host, h.port)
                    raise urllib.error.URLError(err)
                else:
                    r = h.getresponse()

                # Pick apart the HTTPResponse object to get the addinfourl
                # object initialized properly.

                # Wrap the HTTPResponse object in socket's file object adapter
                # for Windows.  That adapter calls recv(), so delegate recv()
                # to read().  This weird wrapping allows the returned object to
                # have readline() and readlines() methods.

                # XXX It might be better to extract the read buffering code
                # out of socket._fileobject() and into a base class.
                r.recv = r.read

                # no data, just have to read
                r.read()
                class fp_dummy(object):
                    def read(self):
                        return ""
                    def readline(self):
                        return ""
                    def close(self):
                        pass
                    closed = False

                resp = urllib.response.addinfourl(fp_dummy(), r.msg, req.get_full_url())
                resp.code = r.status
                resp.msg = r.reason

                # Close connection when server requests it.
                if fetch.connection_cache is not None:
                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
                        fetch.connection_cache.remove_connection(h.host, h.port)

                return resp

        class HTTPMethodFallback(urllib.request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                if req.get_method() != 'GET':
                    newheaders = dict((k, v) for k, v in list(req.headers.items())
                                      if k.lower() not in ("content-length", "content-type"))
                    return self.parent.open(urllib.request.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.origin_req_host,
                                                        unverifiable=True))

                raise urllib.request.HTTPError(req, code, msg, headers, None)

            # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
            # Forbidden when they actually mean 405 Method Not Allowed.
            http_error_403 = http_error_405


        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
            """
            urllib2.HTTPRedirectHandler resets the method to GET on redirect,
            when we want to follow redirects using the original method.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
                newreq.get_method = req.get_method
                return newreq

        exported_proxies = export_proxies(d)

        handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback]
        if exported_proxies:
            handlers.append(urllib.request.ProxyHandler())
        handlers.append(CacheHTTPHandler())
        # Since Python 2.7.9 ssl cert validation is enabled by default
        # see PEP-0476, this causes verification errors on some https servers
        # so disable by default.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))
        opener = urllib.request.build_opener(*handlers)

        try:
            uri = ud.url.split(";")[0]
            r = urllib.request.Request(uri)
            r.get_method = lambda: "HEAD"
            # Some servers (FusionForge, as used on Alioth) require that the
            # optional Accept header is set.
            r.add_header("Accept", "*/*")
            r.add_header("User-Agent", self.user_agent)
            def add_basic_auth(login_str, request):
                '''Adds Basic auth to http request, pass in login:password as string'''
                import base64
                encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
                authheader = "Basic %s" % encodeuser
                r.add_header("Authorization", authheader)

            if ud.user and ud.pswd:
                add_basic_auth(ud.user + ':' + ud.pswd, r)

            try:
                import netrc
                n = netrc.netrc()
                login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname)
                add_basic_auth("%s:%s" % (login, password), r)
            except (TypeError, ImportError, IOError, netrc.NetrcParseError):
                # No netrc file, no entry for this host, or an unparsable
                # netrc: carry on without credentials.
                pass

            with opener.open(r) as response:
                pass
        except urllib.error.URLError as e:
            if try_again:
                logger.debug2("checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug2("checkstatus() urlopen failed: %s" % e)
                return False
        except ConnectionResetError as e:
            if try_again:
                logger.debug2("checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug2("checkstatus() urlopen failed: %s" % e)
                return False

        return True

    def _parse_path(self, regex, s):
        """
        Find and group name, version and archive type in the given string s
        """

        m = regex.search(s)
        if m:
            pname = ''
            pver = ''
            ptype = ''

            mdict = m.groupdict()
            if 'name' in mdict.keys():
                pname = mdict['name']
            if 'pver' in mdict.keys():
                pver = mdict['pver']
            if 'type' in mdict.keys():
                ptype = mdict['type']

            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))

            return (pname, pver, ptype)

        return None

    def _modelate_version(self, version):
        """Normalize a version string so that bb.utils.vercmp() orders
        rc/beta/alpha releases below the corresponding final release."""
        if version[0] in ['.', '-']:
            if version[1].isdigit():
                version = version[1] + version[0] + version[2:len(version)]
            else:
                version = version[1:len(version)]

        version = re.sub('-', '.', version)
        version = re.sub('_', '.', version)
        # Map pre-release tags onto numeric components that compare lower
        # than a plain numeric release.
        version = re.sub('(rc)+', '.1000.', version)
        version = re.sub('(beta)+', '.100.', version)
        version = re.sub('(alpha)+', '.10.', version)
        if version[0] == 'v':
            version = version[1:len(version)]
        return version

    def _vercmp(self, old, new):
        """
        Check whether 'new' is newer than 'old' version. We use existing vercmp() for the
        purpose. PE is cleared in comparison as it's not for build, and PR is cleared too
        for simplicity as it's somehow difficult to get from various upstream format
        """

        (oldpn, oldpv, oldsuffix) = old
        (newpn, newpv, newsuffix) = new

        # Check for a new suffix type that we have never heard of before
        if newsuffix:
            m = self.suffix_regex_comp.search(newsuffix)
            if not m:
                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
                return False

        # Not our package so ignore it
        if oldpn != newpn:
            return False

        oldpv = self._modelate_version(oldpv)
        newpv = self._modelate_version(newpv)

        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))

    def _fetch_index(self, uri, ud, d):
        """
        Run fetch checkstatus to get directory information
        """
        # Note: the listing file must live inside workdir so wget's -O target
        # is cleaned up together with the temporary directory.
        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
            fetchcmd = self.basecmd
            fetchcmd += " -O " + f.name + " --user-agent='" + self.user_agent + "' '" + uri + "'"
            try:
                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
                fetchresult = f.read()
            except bb.fetch2.BBFetchException:
                # Index not fetchable; treat as an empty listing.
                fetchresult = ""

        return fetchresult

    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path
        If error or no version, return ""
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""

    def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d):
        """
        Scan every directory in order to get upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

        dirver_regex = re.compile(r"(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When prefix is part of the version directory it needs to
                # ensure that only the version directory is used, so remove
                # previous directories if they exist.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5' the expected
                # result is v2.5.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, True) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]

    def _init_regexes(self, package, ud, d):
        """
        Match as many patterns as possible such as:
                gnome-common-2.20.0.tar.gz (most common format)
                gtk+-2.90.1.tar.gz
                xf86-input-synaptics-12.6.9.tar.gz
                dri2proto-2.3.tar.gz
                blktool_4.orig.tar.gz
                libid3tag-0.15.1b.tar.gz
                unzip552.tar.gz
                icu4c-3_6-src.tgz
                genext2fs_1.3.orig.tar.gz
                gst-fluendo-mp3
        """
        # match most patterns which uses "-" as separator to version digits
        pn_prefix1 = r"[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
        # a loose pattern such as for unzip552.tar.gz
        pn_prefix2 = r"[a-zA-Z]+"
        # a loose pattern such as for 80325-quicky-0.4.tar.gz
        pn_prefix3 = r"[0-9]+[-]?[a-zA-Z]+"
        # Save the Package Name (pn) Regex for use later
        pn_regex = r"(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)

        # match version
        pver_regex = r"(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"

        # match arch
        parch_regex = "-source|_all_"

        # src.rpm extension was added only for rpm package. Can be removed if the rpm
        # package will always be considered as having to be manually upgraded
        psuffix_regex = r"(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)"

        # match name, version and archive type of a package
        package_regex_comp = re.compile(r"(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
        self.suffix_regex_comp = re.compile(psuffix_regex)

        # compile regex, can be specific by package or generic regex
        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
        if pn_regex:
            package_custom_regex_comp = re.compile(pn_regex)
        else:
            version = self._parse_path(package_regex_comp, package)
            if version:
                package_custom_regex_comp = re.compile(
                    r"(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
            else:
                package_custom_regex_comp = None

        return package_custom_regex_comp

    def latest_versionstring(self, ud, d):
        """
        Manipulate the URL and try to obtain the latest package version

        sanity check to ensure same name and type.
        """
        package = ud.path.split("/")[-1]
        current_version = ['', d.getVar('PV'), '']

        """possible to have no version in pkg name, such as spectrum-fw"""
        if not re.search(r"\d+", package):
            current_version[1] = re.sub('_', '.', current_version[1])
            current_version[1] = re.sub('-', '.', current_version[1])
            return (current_version[1], '')

        package_regex = self._init_regexes(package, ud, d)
        if package_regex is None:
            bb.warn("latest_versionstring: package %s don't match pattern" % (package))
            return ('', '')
        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))

        uri = ""
        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
        if not regex_uri:
            path = ud.path.split(package)[0]

            # search for version matches on folders inside the path, like:
            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
            dirver_regex = re.compile(r"(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
            m = dirver_regex.search(path)
            if m:
                pn = d.getVar('PN')
                dirver = m.group('dirver')

                dirver_pn_regex = re.compile(r"%s\d?" % (re.escape(pn)))
                if not dirver_pn_regex.search(dirver):
                    return (self._check_latest_version_by_dir(dirver,
                        package, package_regex, current_version, ud, d), '')

            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
        else:
            uri = regex_uri

        return (self._check_latest_version(uri, package, package_regex,
                current_version, ud, d), '')