1# Copyright 2013-2018 The Distro Tracker Developers
2# See the COPYRIGHT file at the top-level directory of this distribution and
3# at https://deb.li/DTAuthors
4#
5# This file is part of Distro Tracker. It is subject to the license terms
6# in the LICENSE file found in the top-level directory of this
7# distribution and at https://deb.li/DTLicense. No part of Distro Tracker,
8# including this file, may be copied, modified, propagated, or distributed
9# except according to the terms contained in the LICENSE file.
10"""Utilities for processing Debian package information."""
11import os
12import re
13import shutil
14import subprocess
15import tarfile
17import apt
19import apt_pkg
21from debian import deb822
23from django.conf import settings
24from django.urls import reverse
25from django.utils.encoding import force_bytes
27from distro_tracker.core.utils.email_messages import \
28 name_and_address_from_string as parse_address
29from distro_tracker.core.utils.email_messages import \
30 names_and_addresses_from_string as parse_addresses
def package_hashdir(package_name):
    """
    Returns the name of the hash directory used to avoid having too
    many entries in a single directory.

    The hash directory is the first letter of the package name, except
    for names starting with ``lib`` where the first 4 letters are used.

    :param package_name: The package name.
    :type package_name: str

    :returns: Name of the hash directory, or ``None`` when no package name
        is given.
    :rtype: str
    """
    if package_name is None:
        return None
    # lib* packages are spread over more directories by using a longer prefix
    prefix_length = 4 if package_name.startswith('lib') else 1
    return package_name[:prefix_length]
def package_url(package_name):
    """
    Returns the URL of the page dedicated to this package name.

    The URL is resolved by reversing the ``dtracker-package-page`` URL
    pattern with the string form of the given name.

    :param package_name: The package name.
    :type package_name: str or PackageName model

    :returns: URL of the package page, or ``None`` when no package name is
        given.
    :rtype: str
    """
    if package_name is None:
        return None
    url_kwargs = {'package_name': str(package_name)}
    return reverse('dtracker-package-page', kwargs=url_kwargs)
def extract_vcs_information(stanza):
    """
    Extracts the VCS information from a package's Sources entry.

    Field names are compared case-insensitively. ``Vcs-Browser`` is stored
    under ``browser``; any other ``Vcs-<type>`` field provides ``type`` and
    ``url``. For git, a trailing ``-b <branch>`` is split off the URL and
    stored under ``branch``.

    :param stanza: The ``Sources`` entry from which to extract the VCS info.
        Maps ``Sources`` key names to values.
    :type stanza: dict

    :returns: VCS information regarding the package. Contains the following
        keys: type[, browser, url, branch]
    :rtype: dict
    """
    vcs = {}
    for field_name, field_value in stanza.items():
        field_name = field_name.lower()
        if field_name == 'vcs-browser':
            vcs['browser'] = field_value
            continue
        if not field_name.startswith('vcs-'):
            continue

        # Everything after the "vcs-" prefix names the VCS type
        vcs_type = field_name[4:]
        vcs['type'] = vcs_type
        vcs['url'] = field_value
        if vcs_type != 'git':
            continue

        # Git URLs may carry a branch selector: "<url> -b <branch>"
        branch_match = re.match(
            r'(?P<url>.*?)\s+-b\s*(?P<branch>\S+)', field_value)
        if branch_match:
            vcs['url'] = branch_match.group('url')
            vcs['branch'] = branch_match.group('branch')
    return vcs
def extract_dsc_file_name(stanza):
    """
    Extracts the name of the .dsc file from a package's Sources entry.

    The ``checksums-sha256``, ``checksums-sha1`` and ``files`` fields are
    searched in that order; the first listed file whose name ends in
    ``.dsc`` wins.

    :param stanza: The ``Sources`` entry from which to extract the .dsc
        file name. Maps ``Sources`` key names to values.
    :type stanza: dict

    :returns: The .dsc file name, or ``None`` if no such file is listed.
    """
    candidate_fields = ('checksums-sha256', 'checksums-sha1', 'files')
    dsc_names = (
        file_entry['name']
        for field in candidate_fields
        for file_entry in stanza.get(field, [])
        if file_entry.get('name', '').endswith('.dsc')
    )
    return next(dsc_names, None)
def extract_information_from_sources_entry(stanza):
    """
    Extracts information from a ``Sources`` file entry and returns it in the
    form of a dictionary.

    The ``binary``, ``version``, ``architecture`` and ``maintainer`` fields
    are required (a missing one raises :class:`KeyError`); all other fields
    default to an empty string.

    :param stanza: The raw entry's key-value pairs.
    :type stanza: Case-insensitive dict

    :returns: The extracted information.
    :rtype: dict
    """
    # "binary" is a comma separated list of binary package names
    binary_names = [name.strip() for name in stanza['binary'].split(',')]

    def optional(field):
        # Optional fields fall back to an empty string
        return stanza.get(field, '')

    return {
        'version': stanza['version'],
        'homepage': optional('homepage'),
        'priority': optional('priority'),
        'section': optional('section'),
        'architectures': stanza['architecture'].split(),
        'binary_packages': binary_names,
        'maintainer': parse_address(stanza['maintainer']),
        'uploaders': parse_addresses(optional('uploaders')),
        'standards_version': optional('standards-version'),
        'vcs': extract_vcs_information(stanza),
        'dsc_file_name': extract_dsc_file_name(stanza),
        'directory': optional('directory'),
    }
def extract_information_from_packages_entry(stanza):
    """
    Extracts information from a ``Packages`` file entry and returns it in the
    form of a dictionary.

    :param stanza: The raw entry's key-value pairs.
    :type stanza: Case-insensitive dict

    :returns: A dict with the ``version`` and a ``short_description``
        holding at most the first 300 characters of the description.
    :rtype: dict
    """
    version = stanza['version']
    description = stanza.get('description', '')
    return {
        'version': version,
        'short_description': description[:300],
    }
class SourcePackageRetrieveError(Exception):
    """
    Raised when the files of a source package cannot be retrieved.
    """
class AptCache(object):
    """
    A class for handling cached package information.

    The cache lives in the ``apt-cache`` subdirectory of
    ``settings.DISTRO_TRACKER_CACHE_DIRECTORY``. Creating an instance
    (re)writes the apt configuration files found there and points the
    ``APT_CONFIG`` environment variable at the generated ``apt.conf``.
    """
    DEFAULT_MAX_SIZE = 1 * 1024 ** 3  # 1 GiB
    QUILT_FORMAT = '3.0 (quilt)'

    class AcquireProgress(apt.progress.base.AcquireProgress):
        """
        Instances of this class can be passed to :meth:`apt.cache.Cache.update`
        calls.
        It provides a way to track which files were changed and which were not
        by an update operation.
        """
        def __init__(self, *args, **kwargs):
            super(AptCache.AcquireProgress, self).__init__(*args, **kwargs)
            #: Base names of the files reported through :meth:`done`
            self.fetched = []
            #: Base names of the files reported through :meth:`ims_hit`
            self.hit = []

        def done(self, item):
            """Records the base name of the item's destination file."""
            self.fetched.append(os.path.split(item.owner.destfile)[1])

        def ims_hit(self, item):
            """Records the base name of the item's destination file."""
            self.hit.append(os.path.split(item.owner.destfile)[1])

        def pulse(self, owner):
            # Always True; presumably this tells apt to carry on with the
            # acquisition -- confirm against the python-apt documentation.
            return True

    def __init__(self):
        # The root cache directory is a subdirectory in the
        # DISTRO_TRACKER_CACHE_DIRECTORY
        self.cache_root_dir = os.path.join(
            settings.DISTRO_TRACKER_CACHE_DIRECTORY,
            'apt-cache'
        )
        self.sources_list_path = os.path.join(
            self.cache_root_dir, 'etc', 'sources.list')
        self.conf_file_path = os.path.join(self.cache_root_dir,
                                           'etc', 'apt.conf')
        # Process-wide side effect: apt is pointed at our generated apt.conf
        os.environ['APT_CONFIG'] = self.conf_file_path

        self.sources = []
        self.packages = []
        self.cache_max_size = getattr(
            settings, 'DISTRO_TRACKER_APT_CACHE_MAX_SIZE',
            self.DEFAULT_MAX_SIZE)
        #: The directory where source package files are cached
        self.source_cache_directory = os.path.join(self.cache_root_dir,
                                                   'packages')
        self._cache_size = None  # Evaluate the cache size lazily

        self.configure_cache()

    @property
    def cache_size(self):
        """
        The size in bytes of the source package cache directory. It is
        computed on first access and then kept in ``_cache_size``.
        """
        if self._cache_size is None:
            self._cache_size = \
                self.get_directory_size(self.source_cache_directory)
        return self._cache_size

    def get_directory_size(self, directory_path):
        """
        Returns the total space taken by the given directory in bytes.

        :param directory_path: The path to the directory
        :type directory_path: string

        :rtype: int
        """
        # Convert the directory path to bytes to make sure all os calls deal
        # with bytes, not unicode objects.
        # This way any file names with invalid utf-8 names, are correctly
        # handled, without causing an error.
        directory_path = force_bytes(directory_path)
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(directory_path):
            for file_name in filenames:
                file_path = os.path.join(dirpath, file_name)
                # lstat: a symlink counts for itself, not for its target
                total_size += os.lstat(file_path).st_size

        return total_size

    def clear_cache(self):
        """
        Removes all cache information. This causes the next update to retrieve
        fresh repository files.
        """
        self._remove_dir(self.cache_root_dir)
        self.configure_cache()

    def update_sources_list(self):
        """
        Updates the ``sources.list`` file used to list repositories for which
        package information should be cached.
        """
        from distro_tracker.core.models import Repository

        # exist_ok avoids the check-then-create race of os.path.exists()
        os.makedirs(os.path.dirname(self.sources_list_path), exist_ok=True)

        with open(self.sources_list_path, 'w') as sources_list:
            for repository in Repository.objects.all():
                sources_list.write(repository.sources_list_entry + '\n')

    def update_apt_conf(self):
        """
        Updates the ``apt.conf`` file which gives general settings for the
        :class:`apt.cache.Cache`.

        In particular, this updates the list of all architectures which should
        be considered in package updates based on architectures that the
        repositories support.
        """
        from distro_tracker.core.models import Architecture

        with open(self.conf_file_path, 'w') as conf_file:
            conf_file.write('APT::Architectures { ')
            for architecture in Architecture.objects.all():
                conf_file.write('"{arch}"; '.format(arch=architecture))
            conf_file.write('};\n')
            conf_file.write('Acquire::CompressionTypes::Order:: "xz";\n')
            conf_file.write('Dir "{}/";\n'.format(self.cache_root_dir))
            conf_file.write('Dir::State "state/";\n')
            conf_file.write('Dir::State::status "dpkg-status";\n')
            conf_file.write('Dir::Etc "etc/";\n')
            conf_file.write('Dir::Etc::sourcelist "{src}";\n'.format(
                src=self.sources_list_path))
            conf_file.write('Dir::Etc::Trusted "{src}";\n'.format(
                src=settings.DISTRO_TRACKER_TRUSTED_GPG_MAIN_FILE))
            conf_file.write('Dir::Etc::TrustedParts "{src}";\n'.format(
                src=settings.DISTRO_TRACKER_TRUSTED_GPG_PARTS_DIR))

    def configure_cache(self):
        """
        Configures the cache based on the most current repository information.
        """
        self.update_sources_list()
        self.update_apt_conf()
        # Clean up the configuration we might have read during "import apt"
        for root_key in apt_pkg.config.list():
            apt_pkg.config.clear(root_key)
        # Load the proper configuration
        apt_pkg.init()
        # Ensure we have the required directories
        required_dirs = (
            apt_pkg.config.find_dir('Dir::State::lists'),
            apt_pkg.config.find_dir('Dir::Etc::sourceparts'),
            apt_pkg.config.find_dir('Dir::Cache::archives'),
        )
        for apt_dir in required_dirs:
            os.makedirs(apt_dir, exist_ok=True)

    def _index_file_full_path(self, file_name):
        """
        Returns the absolute path for the given cached index file.

        :param file_name: The name of the cached index file.
        :type file_name: string

        :rtype: string
        """
        return os.path.join(
            apt_pkg.config.find_dir('Dir::State::lists'),
            file_name
        )

    def _match_index_file_to_repository(self, sources_file):
        """
        Returns a two-tuple ``(class:`Repository <distro_tracker.core.
        models.Repository>`, component)``. The class:`Repository
        <distro_tracker.core.models.Repository>` instance which matches the
        given cached ``Sources`` file and the ``component`` of the ``Source``.

        A two-tuple is always returned: ``(None, None)`` when the file does
        not belong to any entry of the current ``sources.list`` and
        ``(None, component)`` when no known repository serves the matched
        component URL.

        :rtype: (:class:`Repository <distro_tracker.core.models.Repository>`,
            string)
        """
        from distro_tracker.core.models import Repository

        sources_list = apt_pkg.SourceList()
        sources_list.read_main_list()
        index_name = os.path.basename(sources_file)
        component_url = None
        component = None
        for entry in sources_list.list:
            for index_file in entry.index_files:
                if index_name in index_file.describe:
                    # The description starts with "<base url> <component>"
                    base_url, component, _ = index_file.describe.split(None, 2)
                    base_url = base_url.rstrip('/')
                    component_url = base_url + '/' + component
                    break

        if component is None:
            # No index file matched (e.g. a stale file left in the lists
            # directory); there is no component to split.
            return None, None

        # Keep only the second "/"-separated part when there is one
        components = component.split('/')
        if len(components) >= 2:
            component = components[1].strip()

        for repository in Repository.objects.all():
            if component_url in repository.component_urls:
                return repository, component

        return None, component

    def _get_all_cached_files(self):
        """
        Returns a list of all cached files.
        """
        lists_directory = apt_pkg.config.find_dir('Dir::State::lists')
        try:
            return [
                os.path.join(lists_directory, file_name)
                for file_name in os.listdir(lists_directory)
                if os.path.isfile(os.path.join(lists_directory, file_name))
            ]
        except OSError:
            # The directory structure does not exist => nothing is cached
            return []

    def get_cached_files(self, filter_function=None):
        """
        Returns cached files, optionally filtered by the given
        ``filter_function``

        :param filter_function: Takes a file name as the only parameter and
            returns a :class:`bool` indicating whether it should be included
            in the result.
        :type filter_function: callable

        :returns: A list of cached file names
        :rtype: list
        """
        cached_files = self._get_all_cached_files()
        if filter_function is None:
            # Include all files if the filter function is not provided
            return cached_files

        return [
            file_name
            for file_name in cached_files
            if filter_function(file_name)
        ]

    def get_sources_files_for_repository(self, repository):
        """
        Returns all ``Sources`` files which are cached for the given
        repository.

        For instance, ``Sources`` files for different suites are cached
        separately.

        :param repository: The repository for which to return all cached
            ``Sources`` files
        :type repository: :class:`Repository
            <distro_tracker.core.models.Repository>`

        :rtype: ``iterable`` of strings
        """
        return self.get_cached_files(
            lambda file_name: (
                file_name.endswith('Sources') and
                self._match_index_file_to_repository(
                    file_name)[0] == repository))

    def get_packages_files_for_repository(self, repository):
        """
        Returns all ``Packages`` files which are cached for the given
        repository.

        For instance, ``Packages`` files for different suites are cached
        separately.

        :param repository: The repository for which to return all cached
            ``Packages`` files
        :type repository: :class:`Repository
            <distro_tracker.core.models.Repository>`

        :rtype: ``iterable`` of strings
        """
        return self.get_cached_files(
            lambda file_name: (
                file_name.endswith('Packages') and
                self._match_index_file_to_repository(
                    file_name)[0] == repository))

    def update_repositories(self, force_download=False):
        """
        Initiates a cache update.

        :param force_download: If set to ``True`` causes the cache to be
            cleared before starting the update, thus making sure all index
            files are downloaded again.

        :returns: A two-tuple ``(updated_sources, updated_packages)``. Each of
            the tuple's members is a list of
            (:class:`Repository <distro_tracker.core.models.Repository>`,
            ``component``, ``file_name``) tuple representing the repository
            which was updated, component, and the file which contains the fresh
            information. The file is either a ``Sources`` or a ``Packages``
            file respectively. Fetched files which cannot be matched to a
            known repository are left out.
        """
        if force_download:
            self.clear_cache()

        self.configure_cache()

        cache = apt.Cache(rootdir=self.cache_root_dir)
        progress = AptCache.AcquireProgress()
        cache.update(progress)

        updated_sources = []
        updated_packages = []
        for fetched_file in progress.fetched:
            if fetched_file.endswith('Sources'):
                dest = updated_sources
            elif fetched_file.endswith('Packages'):
                dest = updated_packages
            else:
                continue
            repository, component = self._match_index_file_to_repository(
                fetched_file)
            if repository is None:
                # Nothing to report for a file no repository claims
                continue
            dest.append((
                repository, component, self._index_file_full_path(fetched_file)
            ))

        return updated_sources, updated_packages

    def _get_format(self, record):
        """
        Returns the Format field value of the given source package record.
        """
        record = deb822.Deb822(record)
        return record['format']

    def _extract_quilt_package_debian_tar(self, debian_tar_path, outdir):
        """
        Extracts the given tarball to the given output directory.

        Every member is checked first: if any of them would be written
        outside of ``outdir`` nothing is extracted.

        :raises Exception: If a member's path escapes ``outdir``.
        """
        abs_outdir = os.path.abspath(outdir)
        with tarfile.open(debian_tar_path) as archive_file:
            for member in archive_file.getmembers():
                member_path = os.path.abspath(
                    os.path.join(outdir, member.name))
                # commonpath() compares whole path components; the
                # character-wise commonprefix() would accept a sibling such
                # as "<outdir>-evil/file" as being inside "<outdir>".
                common = os.path.commonpath([abs_outdir, member_path])
                if common != abs_outdir:
                    raise Exception("Attempted Path Traversal in Tar File")

            archive_file.extractall(outdir)

    def get_package_source_cache_directory(self, package_name):
        """
        Returns the path to the directory where a particular source package is
        cached.

        :param package_name: The name of the source package
        :type package_name: string

        :rtype: string
        """
        # First letter, or the first 4 letters for lib* packages
        package_hash = (
            package_name[0]
            if not package_name.startswith('lib') else
            package_name[:4]
        )
        return os.path.join(
            self.source_cache_directory,
            package_hash,
            package_name)

    def get_source_version_cache_directory(self, package_name, version):
        """
        Returns the path to the directory where a particular source package
        version files are extracted.

        :param package_name: The name of the source package
        :type package_name: string

        :param version: The version of the source package
        :type version: string

        :rtype: string
        """
        package_dir = self.get_package_source_cache_directory(package_name)
        return os.path.join(package_dir, package_name + '-' + version)

    def _remove_dir(self, directory_path):
        """
        Removes the given directory, including any subdirectories and files.
        The method makes sure to correctly handle the situation where the
        directory contains files with names which are invalid utf-8.
        """
        # Convert the directory path to bytes to make sure all os calls deal
        # with bytes, not unicode objects.
        # This way any file names with invalid utf-8 names, are correctly
        # handled, without causing an error.
        directory_path = force_bytes(directory_path)
        if os.path.exists(directory_path):
            shutil.rmtree(directory_path)

    def clear_cached_sources(self):
        """
        Clears all cached package source files.
        """
        self._remove_dir(self.source_cache_directory)
        self._cache_size = self.get_directory_size(self.source_cache_directory)

    def _get_apt_source_records(self, source_name, version):
        """
        Returns a :class:`apt_pkg.SourceRecords` instance where the given
        source package is the current working record.

        :raises SourcePackageRetrieveError: If the cache has no record of the
            given version of the source package.
        """
        apt.Cache(rootdir=self.cache_root_dir)  # must be pre-created
        source_records = apt_pkg.SourceRecords()
        source_records.restart()
        # Find the cached record matching this source package and version
        while source_records.lookup(source_name):
            if source_records.version == version:
                return source_records

        # Package version does not exist in the cache
        raise SourcePackageRetrieveError(
            "Could not retrieve package {pkg} version {ver}:"
            " No such version found in the cache".format(
                pkg=source_name, ver=version))

    def _extract_dpkg_source(self, retrieved_files, outdir):
        """
        Uses dpkg-source to extract the source package.

        :raises SourcePackageRetrieveError: If there is no ``.dsc`` file among
            the retrieved files.
        """
        dsc_file_path = next(
            (file_path
             for file_path in retrieved_files
             if file_path.endswith('.dsc')),
            None)
        if dsc_file_path is None:
            # Report the problem explicitly instead of leaking the bare
            # StopIteration of next()
            raise SourcePackageRetrieveError(
                'Could not extract the source package:'
                ' no .dsc file was retrieved')
        dsc_file_path = os.path.abspath(dsc_file_path)
        outdir = os.path.abspath(outdir)
        subprocess.check_output(["dpkg-source", "-x", dsc_file_path, outdir],
                                stderr=subprocess.STDOUT)

    def _apt_acquire_package(self,
                             source_records,
                             dest_dir_path,
                             debian_directory_only):
        """
        Using :class:`apt_pkg.Acquire`, retrieves the source files for the
        source package described by the current source_records record.

        :param source_records: The record describing the source package whose
            files should be retrieved.
        :type source_records: :class:`apt_pkg.SourceRecords`

        :param dest_dir_path: The path to the directory where the downloaded
            files should be saved.
        :type dest_dir_path: string

        :param debian_directory_only: A flag indicating whether only the debian
            directory should be downloaded.

        :returns: A list of absolute paths of all retrieved source files.
        :rtype: list of strings

        :raises SourcePackageRetrieveError: If any of the files could not be
            retrieved.
        """
        package_format = self._get_format(source_records.record)
        only_debian_tar = (
            debian_directory_only and package_format == self.QUILT_FORMAT)
        # A reference to each AcquireFile instance must be kept
        files = []
        acquire = apt_pkg.Acquire(apt.progress.base.AcquireProgress())
        for srcfile in source_records.files:
            if only_debian_tar and srcfile.type != 'diff':
                # Only retrieve the .debian.tar.* file for quilt packages
                # when only the debian directory is wanted
                continue
            base = os.path.basename(srcfile.path)
            dest_file_path = os.path.join(dest_dir_path, base)
            files.append(apt_pkg.AcquireFile(
                acquire,
                source_records.index.archive_uri(srcfile.path),
                srcfile.hashes,
                srcfile.size,
                base,
                destfile=dest_file_path
            ))

        acquire.run()

        # Check if all items are correctly retrieved and build the list of file
        # paths.
        retrieved_paths = []
        for item in acquire.items:
            if item.status != item.STAT_DONE:
                error_text = item.error_text
                # Decode only when we are handed bytes: calling .decode() on
                # a str would raise AttributeError and hide the real error.
                if isinstance(error_text, bytes):
                    error_text = error_text.decode('utf-8')
                raise SourcePackageRetrieveError(
                    'Could not retrieve file {file}: {error}'.format(
                        file=item.destfile,
                        error=error_text))
            retrieved_paths.append(item.destfile)

        return retrieved_paths

    def retrieve_source(self, source_name, version,
                        debian_directory_only=False):
        """
        Retrieve the source package files for the given source package version.

        :param source_name: The name of the source package
        :type source_name: string
        :param version: The version of the source package
        :type version: string
        :param debian_directory_only: Flag indicating if the method should try
            to retrieve only the debian directory of the source package. This
            is usually only possible when the package format is 3.0 (quilt).
        :type debian_directory_only: Boolean

        :returns: The path to the directory containing the extracted source
            package files.
        :rtype: string

        :raises SourcePackageRetrieveError: If the version is not found in the
            cache or its files could not be retrieved.
        """
        if self.cache_size > self.cache_max_size:
            # If the maximum allowed cache size has been exceeded,
            # clear the cache
            self.clear_cached_sources()

        source_records = self._get_apt_source_records(source_name, version)

        dest_dir_path = self.get_package_source_cache_directory(source_name)
        os.makedirs(dest_dir_path, exist_ok=True)
        # Remember the size of the directory in the beginning
        old_size = self.get_directory_size(dest_dir_path)

        # Download the source files
        retrieved_files = self._apt_acquire_package(
            source_records, dest_dir_path, debian_directory_only)

        # Extract the retrieved source files
        outdir = self.get_source_version_cache_directory(source_name, version)
        # dpkg-source expects this directory not to exist
        self._remove_dir(outdir)

        package_format = self._get_format(source_records.record)
        if debian_directory_only and package_format == self.QUILT_FORMAT:
            if not retrieved_files:
                # Nothing was selected for download, so there is no tarball
                # to extract; fail clearly rather than with an IndexError
                raise SourcePackageRetrieveError(
                    'Could not retrieve package {pkg} version {ver}:'
                    ' No debian tarball found'.format(
                        pkg=source_name, ver=version))
            # dpkg-source cannot extract an incomplete package
            self._extract_quilt_package_debian_tar(retrieved_files[0], outdir)
        else:
            # Let dpkg-source handle the extraction in all other cases
            self._extract_dpkg_source(retrieved_files, outdir)

        # Update the current cache size based on the changes made by getting
        # this source package.
        new_size = self.get_directory_size(dest_dir_path)
        size_delta = new_size - old_size
        self._cache_size += size_delta

        return outdir
def html_package_list(packages):
    """
    Return a HTML-formatted list of packages.

    Each item is split at its first ``/``: the part before it is treated as
    the source package name and turned into a link to its package page, the
    rest (slash included) is appended after the link. The items are joined
    with ``", "``.
    """
    # NOTE(review): names and suffixes are interpolated without HTML
    # escaping -- confirm that callers only pass trusted values.
    def render(package):
        source_package_name, separator, rest = package.partition("/")
        return '<a href="{}">{}</a>{}'.format(
            package_url(source_package_name),
            source_package_name,
            separator + rest)

    return ', '.join(render(package) for package in packages)