1# This script is used as a bitbake task to create a new python manifest
2# $ bitbake python -c create_manifest
3#
# Our goal is to keep python-core as small as possible and add other python
5# packages only when the user needs them, hence why we split upstream python
6# into several packages.
7#
8# In a very simplistic way what this does is:
9# Launch python and see specifically what is required for it to run at a minimum
10#
11# Go through the python-manifest file and launch a separate task for every single
12# one of the files on each package, this task will check what was required for that
13# specific module to run, these modules will be called dependencies.
14# The output of such task will be a list of the modules or dependencies that were
15# found for that file.
16#
17# Such output will be parsed by this script, we will look for each dependency on the
18# manifest and if we find that another package already includes it, then we will add
# that package as an RDEPENDS to the package we are currently checking; in case we don't
20# find the current dependency on any other package we will add it to the current package
21# as part of FILES.
22#
23#
24# This way we will create a new manifest from the data structure that was built during
25# this process, on this new manifest each package will contain specifically only
26# what it needs to run.
27#
28# There are some caveats which we try to deal with, such as repeated files on different
29# packages, packages that include folders, wildcards, and special packages.
# It's also important to note that this method only works for python files, and shared
31# libraries. Static libraries, header files and binaries need to be dealt with manually.
32#
33# This script differs from its python2 version mostly on how shared libraries are handled
34# The manifest file for python3 has an extra field which contains the cached files for
35# each package.
# The method to handle cached files does not work when a module includes a folder which
# itself contains the pycache folder; fortunately this is almost never the case.
38#
39# Author: Alejandro Enedino Hernandez Samaniego <alejandro at enedino dot org>
40
41
42import sys
43import subprocess
44import json
45import os
46import collections
47
# Optional debug switch: '-d' anywhere on the command line enables it.
# The value (either '-d' or an empty string) is handed as-is to the
# get_module_deps3.py helper every time it is launched.
debugFlag = '-d' if '-d' in sys.argv else ''
52
# Get python version from ${PYTHON_MAJMIN}
# The debug switch '-d' is accepted anywhere on the command line, so the
# version is the first argument that is not that switch; this keeps
# "-d <version>" from setting the version to '-d'.
# Raises IndexError when no version argument was given at all.
pyversion = str([arg for arg in sys.argv[1:] if arg != '-d'][0])
55
# Hack to get native python search path (for folders), not fond of it but it works for now.
# Every sys.path entry that mentions the native sysroot is cut right after the
# sysroot folder name; the last such entry wins.
# NOTE: nativelibfolder stays undefined when no sys.path entry contains the pivot.
pivot = 'recipe-sysroot-native'
for entry in sys.path:
    cut = entry.find(pivot)
    if cut != -1:
        nativelibfolder = entry[:cut + len(pivot)]
61
# The manifest being built; packages are inserted in the order they are handled
new_manifest = collections.OrderedDict()

# Bookkeeping for the final report:
#   allfiles  - every file that was added to some package's FILES
#   repeated  - files that were added to more than one package
#   wildcards - FILES entries that end with '*'
allfiles, repeated, wildcards = [], [], []

# Packages that ship at least one directory, and the directories themselves
hasfolders, allfolders = [], []
72
def isFolder(value):
    """Tell whether the manifest entry *value* is a directory in the native sysroot.

    ``${PYTHON_MAJMIN}`` is replaced by ``pyversion`` and ``${libdir}`` is
    tried, in turn, as ``usr/lib``, ``usr/lib64`` and ``usr/lib32`` below
    ``nativelibfolder``.  Returns True as soon as one candidate is an
    existing directory, False otherwise.
    """
    expanded = value.replace('${PYTHON_MAJMIN}', pyversion)
    for libname in ('lib', 'lib64', 'lib32'):
        candidate = expanded.replace('${libdir}', nativelibfolder + '/usr/' + libname)
        if os.path.isdir(candidate):
            return True
    return False
79
def isCached(item):
    """Return True when *item* contains ``__pycache__``, False otherwise."""
    return '__pycache__' in item
85
def prepend_comments(comments, json_manifest):
    """Put the text *comments* in front of the current contents of *json_manifest*.

    The file is opened for update, read completely, and then rewritten from
    its start with *comments* followed by what was read.
    """
    with open(json_manifest, 'r+') as handle:
        body = handle.read()
        handle.seek(0)
        handle.write(comments + body)
91
def print_indent(msg, offset):
    """Print each line of *msg* preceded by *offset* spaces."""
    padding = ' ' * offset
    for line in msg.splitlines():
        print(padding + line)
96
97
# Read existing JSON manifest
with open('python3-manifest.json') as manifest:
    manifest_str = manifest.read()

# The JSON format doesn't allow comments, so the file keeps them in a header
# that ends with an '# EOC' marker line; everything after it is the JSON body.
eoc_marker = '# EOC'
marker_pos = manifest_str.find(eoc_marker)
if marker_pos == -1:
    # No marker: treat the whole file as JSON instead of chopping off
    # its first characters as a bogus comment header.
    json_start = 0
else:
    json_start = marker_pos + len(eoc_marker) + 1 # EOC + \n

# The header is kept so it can be written back in front of the new manifest
comments = manifest_str[:json_start]
manifest_str = manifest_str[json_start:]
old_manifest = json.loads(manifest_str, object_pairs_hook=collections.OrderedDict)
107
108#
109# First pass to get core-package functionality, because we base everything on the fact that core is actually working
110# Not exactly the same so it should not be a function
111#
112
print_indent('Getting dependencies for package: core', 0)

# Ask the helper what python itself needs in order to start, and merge the
# answer into the core package of the OLD manifest without dropping anything
# it already lists.  Later lookups against old_manifest therefore also see
# these files as belonging to core.
core_cmd = [sys.executable, 'get_module_deps3.py', 'python-core-package', '%s' % debugFlag]
output = subprocess.check_output(core_cmd).decode('utf8')
for coredep in output.split():
    # Store paths with the version placeholder instead of the literal version
    coredep = coredep.replace(pyversion, '${PYTHON_MAJMIN}')
    target = old_manifest['core']['cached' if isCached(coredep) else 'files']
    if coredep not in target:
        target.append(coredep)
132
133
# The second step is to loop through the files that the core package holds
# according to the old manifest.  Entries we cannot import (directories,
# binaries, headers, empty strings) are assumed to have been added correctly
# by hand and are left alone; every other entry is treated as a module whose
# dependencies are looked up and merged into core.
#
# Note that the list being walked grows while we walk it: dependencies that
# get appended below are visited by this same loop later on.
core = old_manifest['core']
for filedep in core['files']:
    if isFolder(filedep):
        bucket = core['cached'] if isCached(filedep) else core['files']
        if filedep not in bucket:
            bucket.append(filedep)
        continue
    if '${bindir}' in filedep or '${includedir}' in filedep:
        if filedep not in core['files']:
            core['files'].append(filedep)
        continue
    if filedep == '':
        continue

    # Get actual module name, shouldn't be affected by libdir/bindir, etc.
    module_file = os.path.basename(os.path.normpath(filedep))
    pymodule = os.path.splitext(module_file)[0]

    # We are dealing with a python module now, so the helper can import it and
    # report what it pulled in.  One helper process is launched per module.
    print_indent('Getting dependencies for module: %s' % pymodule, 2)
    helper_cmd = [sys.executable, 'get_module_deps3.py', '%s' % pymodule, '%s' % debugFlag]
    output = subprocess.check_output(helper_cmd).decode('utf8')
    print_indent('The following dependencies were found for module %s:\n' % pymodule, 4)
    print_indent(output, 6)

    for pymodule_dep in output.split():
        pymodule_dep = pymodule_dep.replace(pyversion, '${PYTHON_MAJMIN}')
        bucket = core['cached'] if isCached(pymodule_dep) else core['files']
        if pymodule_dep not in bucket:
            bucket.append(pymodule_dep)
184
185
# At this point we are done with the core package.
# The old_manifest dictionary is updated only for the core package because
# all others will use this as a base.
189
190
print('\n\nChecking for directories...\n')
# To improve the script speed, remember which packages list directories
# (and which directories those are); only these are consulted later when a
# dependency turns out to live inside a folder.
for pypkg, pkgdata in old_manifest.items():
    for filedep in pkgdata['files']:
        if not isFolder(filedep):
            continue
        print_indent('%s is a directory' % filedep, 2)
        if pypkg not in hasfolders:
            hasfolders.append(pypkg)
        if filedep not in allfolders:
            allfolders.append(filedep)
202
203
204
# This is the main loop that will handle each package.
# It works in a similar fashion to the step before, but
# we will now be updating a new dictionary that will eventually
# become the new manifest.
#
# The following loops through all packages in the manifest,
# through all files on each of them, and checks whether or not
# they are modules and can be imported.
# If they can be imported, then it checks for dependencies for
# each of them by launching a separate task.
# The output of that task is then parsed and the manifest is updated
# accordingly, whether it should add the module on FILES for the current package
# or if that module already belongs to another package then the current one
# will RDEPEND on it

# Handle special cases, we assume that when they were manually added
# to the manifest we knew what we were doing.
special_packages = ['misc', 'modules', 'dev', 'tests']

for pypkg in old_manifest:
    # Use an empty dict as data structure to hold data for each package and fill it up
    new_manifest[pypkg] = collections.OrderedDict()
    new_manifest[pypkg]['summary'] = old_manifest[pypkg]['summary']
    new_manifest[pypkg]['rdepends'] = []
    new_manifest[pypkg]['files'] = []
    new_manifest[pypkg]['cached'] = old_manifest[pypkg]['cached']

    # All packages should depend on core
    if pypkg != 'core':
        new_manifest[pypkg]['rdepends'].append('core')
        new_manifest[pypkg]['cached'] = []

    print('\n')
    print('--------------------------')
    print('Handling package %s' % pypkg)
    print('--------------------------')

    if pypkg in special_packages or 'staticdev' in pypkg:
        print_indent('Passing %s package directly' % pypkg, 2)
        new_manifest[pypkg] = old_manifest[pypkg]
        continue

    # We already handled core on the first pass, so its files are copied over
    # (without duplicates) and no dependency lookup is done for it here.
    if pypkg == 'core':
        for filedep in old_manifest[pypkg]['files']:
            if filedep not in new_manifest[pypkg]['files']:
                new_manifest[pypkg]['files'].append(filedep)
        continue

    for filedep in old_manifest[pypkg]['files']:
        # Handle/ignore what we can't import
        if isFolder(filedep):
            new_manifest[pypkg]['files'].append(filedep)
            # Asyncio (and others) are both the package and the folder name, we should not skip those...
            path, mod = os.path.split(filedep)
            if mod != pypkg:
                continue
        if '${bindir}' in filedep:
            if filedep not in new_manifest[pypkg]['files']:
                new_manifest[pypkg]['files'].append(filedep)
            continue
        if filedep == '':
            continue
        if '${includedir}' in filedep:
            if filedep not in new_manifest[pypkg]['files']:
                new_manifest[pypkg]['files'].append(filedep)
            continue

        # Get actual module name, shouldn't be affected by libdir/bindir, etc.
        # We need to check if the imported module comes from another (e.g. sqlite3.dump)
        path, pymodule = os.path.split(filedep)
        path = os.path.basename(path)
        pymodule = os.path.splitext(os.path.basename(pymodule))[0]

        # If this condition is met, it means we need to import it from another module
        # or it's the folder itself (e.g. unittest)
        if path == pypkg:
            if pymodule:
                pymodule = path + '.' + pymodule
            else:
                pymodule = path

        # We now know that we're dealing with a python module, so we can import it
        # and check what its dependencies are.
        # We launch a separate task for each module for deterministic behavior.
        # Each module will only import what is necessary for it to work in specific.
        # The output of each task will contain each module's dependencies

        print_indent('\nGetting dependencies for module: %s' % pymodule, 2)
        output = subprocess.check_output([sys.executable, 'get_module_deps3.py', '%s' % pymodule, '%s' % debugFlag]).decode('utf8')
        print_indent('The following dependencies were found for module %s:\n' % pymodule, 4)
        print_indent(output, 6)

        reportFILES = []
        reportRDEPS = []

        for pymodule_dep in output.split():
            pymodule_dep = pymodule_dep.replace(pyversion, '${PYTHON_MAJMIN}')

            # One of the dependencies that was found could be inside one of the folders
            # included by another package (e.g. folder encodings contained in codecs).
            # In that case the package containing the folder becomes an rdependency.
            # We check whether a known folder is contained in the dependency's path rather
            # than comparing the dependency's parent directory for equality, so that
            # files in nested sub-folders (path/folder1/folder2/filename) still match.
            #
            # The module could have a directory named after it, e.g. xml, if we take out
            # the filename from the path we'll end up with ${libdir}, and we want ${libdir}/xml
            if isFolder(pymodule_dep):
                check_path = pymodule_dep
            else:
                check_path = os.path.dirname(pymodule_dep)

            inFolders = False
            for folder in allfolders:
                if folder not in check_path:
                    continue
                inFolders = True # Did we find a folder?
                # Loop only through packages which contain folders; the first one
                # that lists this folder (in files or cached) is the one we depend on.
                for pypkg_with_folder in hasfolders:
                    folder_entries = old_manifest[pypkg_with_folder]['files'] + old_manifest[pypkg_with_folder]['cached']
                    if folder not in folder_entries:
                        continue
                    print ('%s directory found in %s' % (folder, pypkg_with_folder))
                    if pypkg_with_folder not in new_manifest[pypkg]['rdepends'] and pypkg_with_folder != pypkg:
                        new_manifest[pypkg]['rdepends'].append(pypkg_with_folder)
                    break

            # A folder was found so we're done with this item, we can go on
            if inFolders:
                continue

            # No directories beyond this point
            # We might already have this module on the dictionary since it could depend on a (previously checked) module
            if pymodule_dep in new_manifest[pypkg]['files'] or pymodule_dep in new_manifest[pypkg]['cached']:
                continue

            # Last step: Figure out if this belongs to FILES or RDEPENDS
            # We check if this module is already contained on another package, so we add that one
            # as an RDEPENDS, or if it's not, it means it should be contained on the current
            # package, and we should add it to FILES
            for possible_rdep in old_manifest:
                if pymodule_dep in old_manifest[possible_rdep]['files'] or pymodule_dep in old_manifest[possible_rdep]['cached']:
                    # A hit on the package we are handling is not a dependency; keep looking
                    if possible_rdep != pypkg:
                        if possible_rdep not in new_manifest[pypkg]['rdepends']:
                            # Add it to the new manifest data struct as RDEPENDS since it contains something this module needs
                            reportRDEPS.append('Adding %s to %s RDEPENDS, because it contains %s\n' % (possible_rdep, pypkg, pymodule_dep))
                            new_manifest[pypkg]['rdepends'].append(possible_rdep)
                        break
            else:
                # Since this module wasn't found on another package, it is not an RDEP,
                # so we add it to FILES for this package.
                # A module shouldn't contain itself (${libdir}/python3/sqlite3 shouldn't be on sqlite3 files)
                if os.path.basename(pymodule_dep) != pypkg:
                    reportFILES.append(('Adding %s to %s FILES\n' % (pymodule_dep, pypkg)))
                    if isCached(pymodule_dep):
                        new_manifest[pypkg]['cached'].append(pymodule_dep)
                    else:
                        new_manifest[pypkg]['files'].append(pymodule_dep)
                    if pymodule_dep.endswith('*'):
                        wildcards.append(pymodule_dep)
                    # Check for repeated files
                    if pymodule_dep not in allfiles:
                        allfiles.append(pymodule_dep)
                    else:
                        if pymodule_dep not in repeated:
                            repeated.append(pymodule_dep)

        print('\n')
        print('#################################')
        print('Summary for module %s' % pymodule)
        print('FILES found for module %s:' % pymodule)
        print(''.join(reportFILES))
        print('RDEPENDS found for module %s:' % pymodule)
        print(''.join(reportRDEPS))
        print('#################################')
416
print('The following FILES contain wildcards, please check if they are necessary')
print(wildcards)
print('The following FILES contain folders, please check if they are necessary')
print(hasfolders)


# Sort every list of every package just so the result looks nicer
for pkgdata in new_manifest.values():
    for field in ('files', 'cached', 'rdepends'):
        pkgdata[field].sort()

# Dump the data structure that was built as the new manifest ...
with open('python3-manifest.json.new', 'w') as outfile:
    json.dump(new_manifest, outfile, indent=4)
    outfile.write('\n')

# ... and put the comment header of the old manifest back in front of it
prepend_comments(comments, 'python3-manifest.json.new')

# Files that ended up in more than one package need a manual decision, so
# the script terminates with an error message that lists them.
if repeated:
    error_msg = ''.join([
        '\n\nERROR:\n',
        'The following files were found in more than one package),\n',
        'this is likely to happen when new files are introduced after an upgrade,\n',
        'please check which package should get it,\n modify the manifest accordingly and re-run the create_manifest task:\n',
        '\n'.join(repeated),
        '\n',
    ])
    sys.exit(error_msg)
444
445