# ex:ts=4:sw=4:sts=4:et
# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-
"""
BitBake 'Fetch' implementations

Classes for obtaining upstream sources for the
BitBake build tools.

"""

# Copyright (C) 2003, 2004  Chris Larson
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Based on functions from the base bb module, Copyright 2003 Holger Schurig

import re
import tempfile
import subprocess
import os
import logging
import errno
import bb
import bb.progress
import urllib.request, urllib.parse, urllib.error
from bb.fetch2 import FetchMethod
from bb.fetch2 import FetchError
from bb.fetch2 import logger
from bb.fetch2 import runfetchcmd
from bb.utils import export_proxies
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

class WgetProgressHandler(bb.progress.LineFilterProgressHandler):
    """
    Extract progress information from wget output.
    Note: relies on --progress=dot (with -v or without -q/-nv) being
    specified on the wget command line.
    """
    def __init__(self, d):
        super(WgetProgressHandler, self).__init__(d)
        # Send an initial progress event so the bar gets shown
        self._fire_progress(0)

    def writeline(self, line):
        percs = re.findall(r'(\d+)%\s+([\d.]+[A-Z])', line)
        if percs:
            progress = int(percs[-1][0])
            rate = percs[-1][1] + '/s'
            self.update(progress, rate)
            # Progress lines are consumed here; don't pass them to the log.
            return False
        return True


class Wget(FetchMethod):
    """Class to fetch urls via 'wget'"""
    def supports(self, ud, d):
        """
        Check to see if a given url can be fetched with wget.
        """
        return ud.type in ['http', 'https', 'ftp']

    def recommends_checksum(self, urldata):
        return True

    def urldata_init(self, ud, d):
        if 'protocol' in ud.parm:
            if ud.parm['protocol'] == 'git':
                raise bb.fetch2.ParameterError("Invalid protocol - if you wish to fetch from a git repository using http, you need to instead use the git:// prefix with protocol=http", ud.url)

        if 'downloadfilename' in ud.parm:
            ud.basename = ud.parm['downloadfilename']
        else:
            ud.basename = os.path.basename(ud.path)

        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
        if not ud.localfile:
            ud.localfile = d.expand(urllib.parse.unquote(ud.host + ud.path).replace("/", "."))

        self.basecmd = d.getVar("FETCHCMD_wget") or "/usr/bin/env wget -t 2 -T 30 --passive-ftp --no-check-certificate"

    def _runwget(self, ud, d, command, quiet, workdir=None):

        progresshandler = WgetProgressHandler(d)

        logger.debug(2, "Fetching %s using command '%s'" % (ud.url, command))
        bb.fetch2.check_network_access(d, command, ud.url)
        runfetchcmd(command + ' --progress=dot -v', d, quiet, log=progresshandler, workdir=workdir)

    def download(self, ud, d):
        """Fetch urls"""

        fetchcmd = self.basecmd

        if 'downloadfilename' in ud.parm:
            dldir = d.getVar("DL_DIR")
            bb.utils.mkdirhier(os.path.dirname(dldir + os.sep + ud.localfile))
            fetchcmd += " -O " + dldir + os.sep + ud.localfile

        if ud.user and ud.pswd:
            fetchcmd += " --user=%s --password=%s --auth-no-challenge" % (ud.user, ud.pswd)

        uri = ud.url.split(";")[0]
        if os.path.exists(ud.localpath):
            # file exists, but we didn't complete it.. trying again..
            fetchcmd += d.expand(" -c -P ${DL_DIR} '%s'" % uri)
        else:
            fetchcmd += d.expand(" -P ${DL_DIR} '%s'" % uri)

        self._runwget(ud, d, fetchcmd, False)

        # Sanity check since wget can pretend it succeed when it didn't
        # Also, this used to happen if sourceforge sent us to the mirror page
        if not os.path.exists(ud.localpath):
            raise FetchError("The fetch command returned success for url %s but %s doesn't exist?!" % (uri, ud.localpath), uri)

        if os.path.getsize(ud.localpath) == 0:
            os.remove(ud.localpath)
            raise FetchError("The fetch of %s resulted in a zero size file?! Deleting and failing since this isn't right." % (uri), uri)

        return True

    def checkstatus(self, fetch, ud, d, try_again=True):
        """
        Check whether the url in ud is reachable, using an HTTP HEAD request
        (falling back to GET where HEAD is refused).  Returns True/False.
        Retries once (try_again) to cope with stale cached connections.
        """
        import urllib.request, urllib.error, urllib.parse, socket, http.client
        from urllib.response import addinfourl
        from bb.fetch2 import FetchConnectionCache

        class HTTPConnectionCache(http.client.HTTPConnection):
            if fetch.connection_cache:
                def connect(self):
                    """Connect to the host and port specified in __init__."""

                    sock = fetch.connection_cache.get_connection(self.host, self.port)
                    if sock:
                        self.sock = sock
                    else:
                        self.sock = socket.create_connection((self.host, self.port),
                                    self.timeout, self.source_address)
                        fetch.connection_cache.add_connection(self.host, self.port, self.sock)

                    if self._tunnel_host:
                        self._tunnel()

        class CacheHTTPHandler(urllib.request.HTTPHandler):
            def http_open(self, req):
                return self.do_open(HTTPConnectionCache, req)

            def do_open(self, http_class, req):
                """Return an addinfourl object for the request, using http_class.

                http_class must implement the HTTPConnection API from httplib.
                The addinfourl return value is a file-like object.  It also
                has methods and attributes including:
                    - info(): return a mimetools.Message object for the headers
                    - geturl(): return the original request URL
                    - code: HTTP status code
                """
                host = req.host
                if not host:
                    # Fixed: was a typo'd "urlllib2.URLError" which raised
                    # NameError instead of the intended URLError.
                    raise urllib.error.URLError('no host given')

                h = http_class(host, timeout=req.timeout) # will parse host:port
                h.set_debuglevel(self._debuglevel)

                headers = dict(req.unredirected_hdrs)
                headers.update(dict((k, v) for k, v in list(req.headers.items())
                            if k not in headers))

                # We want to make an HTTP/1.1 request, but the addinfourl
                # class isn't prepared to deal with a persistent connection.
                # It will try to read all remaining data from the socket,
                # which will block while the server waits for the next request.
                # So make sure the connection gets closed after the (only)
                # request.

                # Don't close connection when connection_cache is enabled,
                if fetch.connection_cache is None:
                    headers["Connection"] = "close"
                else:
                    headers["Connection"] = "Keep-Alive" # Works for HTTP/1.0

                headers = dict(
                    (name.title(), val) for name, val in list(headers.items()))

                if req._tunnel_host:
                    tunnel_headers = {}
                    proxy_auth_hdr = "Proxy-Authorization"
                    if proxy_auth_hdr in headers:
                        tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                        # Proxy-Authorization should not be sent to origin
                        # server.
                        del headers[proxy_auth_hdr]
                    h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

                try:
                    h.request(req.get_method(), req.selector, req.data, headers)
                except socket.error as err: # XXX what error?
                    # Don't close connection when cache is enabled.
                    # Instead, try to detect connections that are no longer
                    # usable (for example, closed unexpectedly) and remove
                    # them from the cache.
                    if fetch.connection_cache is None:
                        h.close()
                    elif isinstance(err, OSError) and err.errno == errno.EBADF:
                        # This happens when the server closes the connection despite the Keep-Alive.
                        # Apparently urllib then uses the file descriptor, expecting it to be
                        # connected, when in reality the connection is already gone.
                        # We let the request fail and expect it to be
                        # tried once more ("try_again" in check_status()),
                        # with the dead connection removed from the cache.
                        # If it still fails, we give up, which can happen for bad
                        # HTTP proxy settings.
                        fetch.connection_cache.remove_connection(h.host, h.port)
                    raise urllib.error.URLError(err)
                else:
                    try:
                        r = h.getresponse(buffering=True)
                    except TypeError: # buffering kw not supported
                        r = h.getresponse()

                # Pick apart the HTTPResponse object to get the addinfourl
                # object initialized properly.

                # Wrap the HTTPResponse object in socket's file object adapter
                # for Windows.  That adapter calls recv(), so delegate recv()
                # to read().  This weird wrapping allows the returned object to
                # have readline() and readlines() methods.

                # XXX It might be better to extract the read buffering code
                # out of socket._fileobject() and into a base class.
                r.recv = r.read

                # no data, just have to read
                r.read()
                class fp_dummy(object):
                    def read(self):
                        return ""
                    def readline(self):
                        return ""
                    def close(self):
                        pass
                    closed = False

                resp = addinfourl(fp_dummy(), r.msg, req.get_full_url())
                resp.code = r.status
                resp.msg = r.reason

                # Close connection when server request it.
                if fetch.connection_cache is not None:
                    if 'Connection' in r.msg and r.msg['Connection'] == 'close':
                        fetch.connection_cache.remove_connection(h.host, h.port)

                return resp

        class HTTPMethodFallback(urllib.request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in list(req.headers.items())
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib.request.Request(req.get_full_url(),
                                                    headers=newheaders,
                                                    origin_req_host=req.origin_req_host,
                                                    unverifiable=True))

            # Some servers (e.g. GitHub archives, hosted on Amazon S3) return 403
            # Forbidden when they actually mean 405 Method Not Allowed.
            http_error_403 = http_error_405


        class FixedHTTPRedirectHandler(urllib.request.HTTPRedirectHandler):
            """
            urllib2.HTTPRedirectHandler resets the method to GET on redirect,
            when we want to follow redirects using the original method.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                newreq = urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
                newreq.get_method = lambda: req.get_method()
                return newreq

        exported_proxies = export_proxies(d)

        handlers = [FixedHTTPRedirectHandler, HTTPMethodFallback]
        # Fixed: this previously tested the export_proxies *function*
        # (always truthy) instead of the result of calling it.
        if exported_proxies:
            handlers.append(urllib.request.ProxyHandler())
        handlers.append(CacheHTTPHandler())
        # XXX: Since Python 2.7.9 ssl cert validation is enabled by default
        # see PEP-0476, this causes verification errors on some https servers
        # so disable by default.
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))
        opener = urllib.request.build_opener(*handlers)

        try:
            uri = ud.url.split(";")[0]
            r = urllib.request.Request(uri)
            r.get_method = lambda: "HEAD"
            # Some servers (FusionForge, as used on Alioth) require that the
            # optional Accept header is set.
            r.add_header("Accept", "*/*")
            def add_basic_auth(login_str, request):
                '''Adds Basic auth to http request, pass in login:password as string'''
                import base64
                encodeuser = base64.b64encode(login_str.encode('utf-8')).decode("utf-8")
                authheader = "Basic %s" % encodeuser
                r.add_header("Authorization", authheader)

            if ud.user:
                add_basic_auth(ud.user, r)

            try:
                import netrc, urllib.parse
                n = netrc.netrc()
                login, unused, password = n.authenticators(urllib.parse.urlparse(uri).hostname)
                add_basic_auth("%s:%s" % (login, password), r)
            except (TypeError, ImportError, IOError, netrc.NetrcParseError):
                pass

            with opener.open(r) as response:
                pass
        except urllib.error.URLError as e:
            if try_again:
                logger.debug(2, "checkstatus: trying again")
                return self.checkstatus(fetch, ud, d, False)
            else:
                # debug for now to avoid spamming the logs in e.g. remote sstate searches
                logger.debug(2, "checkstatus() urlopen failed: %s" % e)
                return False
        return True

    def _parse_path(self, regex, s):
        """
        Find and group name, version and archive type in the given string s
        """

        m = regex.search(s)
        if m:
            pname = ''
            pver = ''
            ptype = ''

            mdict = m.groupdict()
            if 'name' in mdict.keys():
                pname = mdict['name']
            if 'pver' in mdict.keys():
                pver = mdict['pver']
            if 'type' in mdict.keys():
                ptype = mdict['type']

            bb.debug(3, "_parse_path: %s, %s, %s" % (pname, pver, ptype))

            return (pname, pver, ptype)

        return None

    def _modelate_version(self, version):
        """
        Normalize a version string so that vercmp() can compare it sensibly:
        strip leading separators and 'v', map separators to '.', and map
        rc/beta/alpha markers to numeric weights (.1000./.100./.10.).
        """
        if version[0] in ['.', '-']:
            if version[1].isdigit():
                version = version[1] + version[0] + version[2:len(version)]
            else:
                version = version[1:len(version)]

        version = re.sub('-', '.', version)
        version = re.sub('_', '.', version)
        version = re.sub('(rc)+', '.1000.', version)
        version = re.sub('(beta)+', '.100.', version)
        version = re.sub('(alpha)+', '.10.', version)
        if version[0] == 'v':
            version = version[1:len(version)]
        return version

    def _vercmp(self, old, new):
        """
        Check whether 'new' is newer than 'old' version. We use existing vercmp() for the
        purpose. PE is cleared in comparison as it's not for build, and PR is cleared too
        for simplicity as it's somehow difficult to get from various upstream format
        """

        (oldpn, oldpv, oldsuffix) = old
        (newpn, newpv, newsuffix) = new

        # Check for a new suffix type that we have never heard of before
        if (newsuffix):
            m = self.suffix_regex_comp.search(newsuffix)
            if not m:
                bb.warn("%s has a possible unknown suffix: %s" % (newpn, newsuffix))
                return False

        # Not our package so ignore it
        if oldpn != newpn:
            return False

        oldpv = self._modelate_version(oldpv)
        newpv = self._modelate_version(newpv)

        return bb.utils.vercmp(("0", oldpv, ""), ("0", newpv, ""))

    def _fetch_index(self, uri, ud, d):
        """
        Run fetch checkstatus to get directory information
        """
        # Fixed: a stray "f = tempfile.NamedTemporaryFile()" here leaked an
        # unclosed temp file on every call; the with-statement below is the
        # only temp file we need.
        with tempfile.TemporaryDirectory(prefix="wget-index-") as workdir, tempfile.NamedTemporaryFile(dir=workdir, prefix="wget-listing-") as f:
            agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.12) Gecko/20101027 Ubuntu/9.10 (karmic) Firefox/3.6.12"
            fetchcmd = self.basecmd
            fetchcmd += " -O " + f.name + " --user-agent='" + agent + "' '" + uri + "'"
            try:
                self._runwget(ud, d, fetchcmd, True, workdir=workdir)
                fetchresult = f.read()
            except bb.fetch2.BBFetchException:
                fetchresult = ""

        return fetchresult

    def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path
        If error or no version, return ""
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""

    def _check_latest_version_by_dir(self, dirver, package, package_regex,
            current_version, ud, d):
        """
        Scan every directory in order to get upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

        dirver_regex = re.compile("(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When prefix is part of the version directory it need to
                # ensure that only version directory is used so remove previous
                # directories if exists.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5' the expected
                # result is v2.5.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, True) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]

    def _init_regexes(self, package, ud, d):
        """
        Match as many patterns as possible such as:
                gnome-common-2.20.0.tar.gz (most common format)
                gtk+-2.90.1.tar.gz
                xf86-input-synaptics-12.6.9.tar.gz
                dri2proto-2.3.tar.gz
                blktool_4.orig.tar.gz
                libid3tag-0.15.1b.tar.gz
                unzip552.tar.gz
                icu4c-3_6-src.tgz
                genext2fs_1.3.orig.tar.gz
                gst-fluendo-mp3
        """
        # match most patterns which uses "-" as separator to version digits
        pn_prefix1 = "[a-zA-Z][a-zA-Z0-9]*([-_][a-zA-Z]\w+)*\+?[-_]"
        # a loose pattern such as for unzip552.tar.gz
        pn_prefix2 = "[a-zA-Z]+"
        # a loose pattern such as for 80325-quicky-0.4.tar.gz
        pn_prefix3 = "[0-9]+[-]?[a-zA-Z]+"
        # Save the Package Name (pn) Regex for use later
        pn_regex = "(%s|%s|%s)" % (pn_prefix1, pn_prefix2, pn_prefix3)

        # match version
        pver_regex = "(([A-Z]*\d+[a-zA-Z]*[\.\-_]*)+)"

        # match arch
        parch_regex = "-source|_all_"

        # src.rpm extension was added only for rpm package. Can be removed if the rpm
        # packaged will always be considered as having to be manually upgraded
        psuffix_regex = "(tar\.gz|tgz|tar\.bz2|zip|xz|tar\.lz|rpm|bz2|orig\.tar\.gz|tar\.xz|src\.tar\.gz|src\.tgz|svnr\d+\.tar\.bz2|stable\.tar\.gz|src\.rpm)"

        # match name, version and archive type of a package
        package_regex_comp = re.compile("(?P<name>%s?\.?v?)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s$)"
                                                    % (pn_regex, pver_regex, parch_regex, psuffix_regex))
        self.suffix_regex_comp = re.compile(psuffix_regex)

        # compile regex, can be specific by package or generic regex
        pn_regex = d.getVar('UPSTREAM_CHECK_REGEX')
        if pn_regex:
            package_custom_regex_comp = re.compile(pn_regex)
        else:
            version = self._parse_path(package_regex_comp, package)
            if version:
                package_custom_regex_comp = re.compile(
                    "(?P<name>%s)(?P<pver>%s)(?P<arch>%s)?[\.-](?P<type>%s)" %
                    (re.escape(version[0]), pver_regex, parch_regex, psuffix_regex))
            else:
                package_custom_regex_comp = None

        return package_custom_regex_comp

    def latest_versionstring(self, ud, d):
        """
        Manipulate the URL and try to obtain the latest package version

        sanity check to ensure same name and type.
        """
        package = ud.path.split("/")[-1]
        current_version = ['', d.getVar('PV'), '']

        # possible to have no version in pkg name, such as spectrum-fw
        if not re.search("\d+", package):
            current_version[1] = re.sub('_', '.', current_version[1])
            current_version[1] = re.sub('-', '.', current_version[1])
            return (current_version[1], '')

        package_regex = self._init_regexes(package, ud, d)
        if package_regex is None:
            bb.warn("latest_versionstring: package %s don't match pattern" % (package))
            return ('', '')
        bb.debug(3, "latest_versionstring, regex: %s" % (package_regex.pattern))

        uri = ""
        regex_uri = d.getVar("UPSTREAM_CHECK_URI")
        if not regex_uri:
            path = ud.path.split(package)[0]

            # search for version matches on folders inside the path, like:
            # "5.7" in http://download.gnome.org/sources/${PN}/5.7/${PN}-${PV}.tar.gz
            dirver_regex = re.compile("(?P<dirver>[^/]*(\d+\.)*\d+([-_]r\d+)*)/")
            m = dirver_regex.search(path)
            if m:
                pn = d.getVar('PN')
                dirver = m.group('dirver')

                dirver_pn_regex = re.compile("%s\d?" % (re.escape(pn)))
                if not dirver_pn_regex.search(dirver):
                    return (self._check_latest_version_by_dir(dirver,
                        package, package_regex, current_version, ud, d), '')

            uri = bb.fetch.encodeurl([ud.type, ud.host, path, ud.user, ud.pswd, {}])
        else:
            uri = regex_uri

        return (self._check_latest_version(uri, package, package_regex,
                current_version, ud, d), '')