1# Copyright 2018 The Distro Tracker Developers 

2# See the COPYRIGHT file at the top-level directory of this distribution and 

3# at https://deb.li/DTAuthors 

4# 

5# This file is part of Distro Tracker. It is subject to the license terms 

6# in the LICENSE file found in the top-level directory of this 

7# distribution and at https://deb.li/DTLicense. No part of Distro Tracker, 

8# including this file, may be copied, modified, propagated, or distributed 

9# except according to the terms contained in the LICENSE file. 

10""" 

11Mixins to combine to create powerful tasks. 

12 

13""" 

14import logging 

15 

16from debian.debian_support import version_compare 

17 

18from django.db import transaction 

19 

20from distro_tracker.core.models import ( 

21 PackageData, 

22 Repository, 

23 SourcePackage, 

24 SourcePackageRepositoryEntry, 

25) 

26 

27logger = logging.getLogger('distro_tracker.tasks') 

28 

29 

class ProcessItems(object):
    """
    Base class for all Process* mixins. Those mixins define a list of
    items that the task should process.
    """

    def __init__(self):
        # Honor --fake-update before the task runs, and drop stale keys
        # from the 'processed' list once it has finished.
        self.register_event_handler('execute-started',
                                    self.handle_fake_update_parameter)
        self.register_event_handler('execute-finished',
                                    self.items_cleanup_processed_list)
        super().__init__()

    def item_to_key(self, item):
        """
        Converts an item to process into a unique string representation
        that can be used to record the fact that the item has been processed.

        :param object item: Any kind of object.
        :return: A unique string representation of the object.
        :rtype: str
        """
        return str(item)

    def item_describe(self, item):
        """
        Converts an item into a dictionary with the most important
        data of the item that we want to save for later when the item
        will have vanished.

        :param object item: Any kind of object.
        :return: A dictionary describing the object.
        :rtype: dict
        """
        return {}

    def item_mark_processed(self, *args):
        """
        Mark an item as having been processed. This records the key associated
        to the item in a ``processed`` dictionary within the persistent
        data of the task.

        :param args: list of items to mark as having been processed
        """
        processed = self.data.setdefault('processed', {})
        for item in args:
            processed[self.item_to_key(item)] = self.item_describe(item)
        self.data_mark_modified()

    def item_needs_processing(self, item):
        """
        Verifies if the item needs to be processed or not.

        :param object item: the item to check
        :return: True if the object is not recorded as having already been
            processed, False otherwise.
        :rtype: bool
        """
        processed = self.data.setdefault('processed', {})
        return self.item_to_key(item) not in processed

    def items_all(self):
        """
        This method returns an iterable of all the existing items, including
        those that have already been processed and those which are going to be
        processed.

        :return: All the existing items.
        :rtype: An iterable, can be an iterator or a list, set, tuple.
        """
        raise NotImplementedError(
            "ProcessItems.items_all() must be overridden.")

    def items_to_process(self):
        """
        This method returns the items that have to be processed by the task.

        Its default implementation in :class:`ProcessItems` is to iterate over
        the items returned by :meth:`items_all` and to :func:`yield` those
        where :meth:`item_needs_processing` returns True.

        If the `force_update` parameter is set to True, then it returns all
        the items without calling :meth:`item_needs_processing`.
        """
        for item in self.items_all():
            if self.force_update or self.item_needs_processing(item):
                yield item

    def items_all_keys(self):
        """
        This method returns all the keys corresponding to valid-existing
        items.

        Its main purpose is to be able to compute the list of keys
        in the 'already-processed' list that are no-longer relevant and can be
        dropped.

        Its default implementation is to iterate over items returned by
        :meth:`items_all` and call :meth:`item_to_key` on them. This method
        can thus be overridden when there are more efficient ways to implement
        this logic.

        :return: the set of keys of the valid objects
        :rtype: set
        """
        return {self.item_to_key(x) for x in self.items_all()}

    def items_to_cleanup(self):
        """
        This method returns an iterator yielding a tuple
        (key, description) for old items that have been processed
        in the past but are no longer existing in :meth:`items_all`.

        The description is the value returned by :meth:`item_describe`
        at the time when the item has been processed. The key is the value
        returned by :meth:`item_to_key` at the time when the item has been
        processed.

        :return: (key, description)
        :rtype: tuple
        """
        processed = self.data.setdefault('processed', {})
        processed_set = set(processed.keys())
        unused_keys = processed_set.difference(self.items_all_keys())
        for key in unused_keys:
            yield (key, processed[key])

    def items_cleanup_processed_list(self):
        """
        This method drops unused keys from the list of processed items.

        To identify unused keys, it computes the difference between the
        set of keys present in the 'processed' list and the set of keys
        returned by :meth:`items_all_keys`.
        """
        processed = self.data.setdefault('processed', {})
        modified = False
        for key, _ in self.items_to_cleanup():
            del processed[key]
            modified = True
        if modified:
            self.data_mark_modified()

    def items_fake_processed_list(self):
        """
        This method goes over all items to process and marks them as
        processed. This is useful to fake the whole update process and
        bootstrap an iterative process where we don't want the initial run
        to process all existing entries.
        """
        for item in self.items_to_process():
            self.item_mark_processed(item)

    def handle_fake_update_parameter(self):
        """
        This method is registered as an execute-started event handler and,
        when the `fake_update` parameter is set, marks all items as processed
        even before the task has a chance to process them.
        """
        if self.fake_update:
            self.items_fake_processed_list()

190 

191 

class ProcessModel(ProcessItems):
    """
    With this mixin, the list of items to be processed is a list of objects
    retrieved through the database model specified in the :attr:`model`
    attribute. Sub-classes should thus at least override this attribute.
    """

    #: The database model defining the list of items to process
    model = None

    def items_all(self):
        """
        Returns all the objects of the configured :attr:`model`, after
        letting :meth:`items_extend_queryset` customize the queryset.

        :rtype: QuerySet
        """
        return self.items_extend_queryset(self.model.objects.all())

    def items_to_process(self):
        """
        Returns the objects that still need to be processed.

        Unlike the generic implementation, this excludes the already
        processed items directly in the database query.

        :rtype: QuerySet
        """
        items = self.items_all()
        # Exclude the items already processed, unless --force-update tells us
        # to reprocess all entries
        if not self.force_update:
            processed = self.data.setdefault('processed', {})
            # XXX: might not be the right thing when primary key is not the id
            processed_keys = [int(key) for key in processed]
            items = items.exclude(pk__in=processed_keys)
        return items

    def items_extend_queryset(self, queryset):
        """
        This method can be overridden by sub-classes to customize the queryset
        returned by :meth:`items_all`. The normal queryset is passed as
        parameter and the method should return the modified queryset.

        :param QuerySet queryset: the original queryset
        :return: the modified queryset
        :rtype: QuerySet
        """
        return queryset

    def item_to_key(self, item):
        """
        For database objects, we use the primary key as the key for the
        processed list.

        :param item: an instance of the associated model
        :return: the value of its primary key
        :rtype: str
        """
        return str(item.pk)

    def items_all_keys(self):
        # Better implementation with an optimized query
        return {str(pk) for pk in
                self.items_all().values_list('pk', flat=True)}

    def item_describe(self, item):
        """
        Describes the item with the values of the model fields listed in
        the optional :attr:`fields_to_save` attribute. Callable attributes
        are called to obtain the value to record.

        :param item: an instance of the associated model
        :return: a dictionary mapping field names to their values
        :rtype: dict
        """
        data = super().item_describe(item)
        for field_name in getattr(self, 'fields_to_save', []):
            field = getattr(item, field_name)
            if callable(field):
                field = field()
            data[field_name] = field
        return data

251 

252 

class ProcessSourcePackage(ProcessModel):
    """
    Process all :class:`~distro_tracker.core.models.SourcePackage` objects.
    """
    model = SourcePackage
    # Save name and version in the item description so the data remains
    # available after the SourcePackage object has vanished.
    fields_to_save = ('name', 'version')

259 

260 

class ProcessSrcRepoEntry(ProcessModel):
    """
    Process all
    :class:`~distro_tracker.core.models.SourcePackageRepositoryEntry`.
    """

    model = SourcePackageRepositoryEntry

    def items_extend_queryset(self, queryset):
        # Pre-fetch the related objects that item_describe() reads, to
        # avoid issuing one extra query per entry.
        related_fields = ('source_package__source_package_name', 'repository')
        return queryset.select_related(*related_fields)

    def item_describe(self, item):
        """
        Record the source package's name and version together with the
        repository's shorthand and id, so the data outlives the entry.
        """
        description = super().item_describe(item)
        srcpkg = item.source_package
        repository = item.repository
        description.update({
            'name': srcpkg.name,
            'version': srcpkg.version,
            'repository': repository.shorthand,
            'repository_id': repository.id,
        })
        return description

280 

281 

class ProcessSrcRepoEntryInDefaultRepository(ProcessSrcRepoEntry):
    """
    Process
    :class:`~distro_tracker.core.models.SourcePackageRepositoryEntry`
    from the default repository.
    """

    def items_extend_queryset(self, queryset):
        # Narrow the parent's queryset down to the default repository.
        return super().items_extend_queryset(queryset).filter(
            repository__default=True)

292 

293 

class ProcessMainRepoEntry(ProcessItems):
    """
    Process the main
    :class:`~distro_tracker.core.models.SourcePackageRepositoryEntry`
    for each package. The main entry is defined as being the one existing in the
    default repository. If there's no default entry for a given package, then
    it's the entry with the biggest version that is taken. If there are still
    two entries, then we take the one in the repository with the biggest
    "position".
    """

    def __init__(self):
        super().__init__()
        # Cache for the mapping package-name -> main entry, filled lazily
        # by items_all() and invalidated around each task execution.
        self.main_entries = None
        self.register_event_handler('execute-started',
                                    self.clear_main_entries_cache)
        self.register_event_handler('execute-finished',
                                    self.clear_main_entries_cache)
        self.register_event_handler('execute-failed',
                                    self.clear_main_entries_cache)

    def clear_main_entries_cache(self):
        """Drop the cached result computed by :meth:`items_all`."""
        self.main_entries = None

    def items_all(self):
        """
        Returns the main repository entry of each package, computing and
        caching the mapping on first call.

        :return: the main entries (one per package)
        """
        if self.main_entries is not None:
            return self.main_entries.values()

        main_entries = {}

        def register_entry(entry):
            # Keep the best entry seen so far for this package: biggest
            # version first, ties broken by the biggest repository position.
            name = entry.source_package.name
            version = entry.source_package.version
            if name not in main_entries:
                main_entries[name] = entry
            else:
                selected_version = main_entries[name].source_package.version
                if version_compare(selected_version, version) < 0:
                    main_entries[name] = entry
                elif version_compare(selected_version, version) == 0:
                    # If both versions are equal, we use the repository with the
                    # biggest position
                    if (entry.repository.position >
                            main_entries[name].repository.position):
                        main_entries[name] = entry

        # First identify entries from the default repository
        qs = SourcePackageRepositoryEntry.objects.filter(
            repository__default=True).select_related(
            'source_package__source_package_name',
            'repository')

        for entry in qs:
            register_entry(entry)

        # Then again for all the other remaining packages
        qs = SourcePackageRepositoryEntry.objects.exclude(
            source_package__source_package_name__name__in=main_entries.keys()
        ).select_related(
            'source_package__source_package_name',
            'repository'
        )
        for entry in qs:
            register_entry(entry)

        self.main_entries = main_entries
        return self.main_entries.values()

    def item_to_key(self, item):
        """Use the entry's database id as its unique key."""
        return str(item.id)

    def item_describe(self, item):
        """Save the package name/version and the repository shorthand."""
        return {
            'name': item.source_package.name,
            'version': item.source_package.version,
            'repository': item.repository.shorthand,
        }

371 

372 

class ProcessRepositoryUpdates(ProcessSrcRepoEntry):
    """
    Watch repositories and generate update operations to be processed.

    :meth:`items_to_process` returns repository entries but you can query
    :meth:`is_new_source_package` on the associated source package to know
    whether the source package was already present in another repository in
    the previous run or not.

    There's a new :meth:`iter_removals_by_repository` to find out packages
    which have been dropped from the repository.
    """

    def __init__(self):
        super().__init__()
        self.register_event_handler('execute-started',
                                    self.compute_known_packages)

    def compute_known_packages(self):
        """
        Go over the list of formerly processed items and build lookup
        tables to quickly check whether a given package is new or not.
        """
        self.pkglist = {'all': {}}
        self.srcpkglist = {'all': {}}
        former_entries = self.data.get('processed', {}).values()
        for description in former_entries:
            name = description['name']
            srckey = '%s_%s' % (name, description['version'])
            repository_id = description['repository_id']
            # Global lookup tables...
            self.pkglist['all'][name] = True
            self.srcpkglist['all'][srckey] = True
            # ... and per-repository ones.
            self.pkglist.setdefault(repository_id, {})[name] = True
            self.srcpkglist.setdefault(repository_id, {})[srckey] = True

    def is_new_source_package(self, srcpkg):
        """
        Returns True if the source package was not present in the former run,
        False otherwise.

        The existence of the source package is deduced from the list of
        already processed entries (with the help of
        :meth:`compute_known_packages` which is called at the start of the
        :meth:`execute` method).

        :param srcpkg: the source package
        :type srcpkg: :class:`~distro_tracker.core.models.SourcePackage`
        :returns: True if never seen, False otherwise
        :rtype: bool
        """
        lookup_key = '%s_%s' % (srcpkg.name, srcpkg.version)
        return lookup_key not in self.srcpkglist['all']

    def iter_removals_by_repository(self):
        """
        Returns an iterator to process all package removals that happened in
        all the repositories. The iterator yields tuples with the package
        name (as a string) and the repository object.
        """
        for repository in Repository.objects.all():
            former_pkgs = self.pkglist.get(repository.id)
            if former_pkgs is None:
                # No data recorded for this repository in the former run.
                continue
            current_pkgs = set(repository.source_packages.all().values_list(
                'source_package_name__name', flat=True))
            for pkgname in former_pkgs:
                if pkgname not in current_pkgs:
                    yield (pkgname, repository)

444 

445 

class PackageTagging(object):
    """
    A task mixin that helps to maintain a set of package tags:
    by untagging packages that no longer should be tagged and by
    tagging packages that should.

    Subclasses must define:
    - `TAG_NAME`: defines the key for PackageData to be updated. One must
    define keys matching `tag:.*`
    - `TAG_DISPLAY_NAME`: defines the display name for the tag
    - `TAG_COLOR_TYPE`: defines the color type to be used while rendering
    content related to the tag. It must be defined based on the tag severity.
    One may use one of the following options: success, danger, warning, or
    info.
    - `TAG_DESCRIPTION`: defines a help text to be displayed with a 'title'
    attribute
    - `TAG_TABLE_TITLE`: the title of the table showing all the packages
    with this tag

    Also, subclasses must implement the :func:`packages_to_tag` function to
    define the list of packages that must be tagged.
    """
    TAG_NAME = None
    TAG_DISPLAY_NAME = ''
    TAG_COLOR_TYPE = ''
    TAG_DESCRIPTION = ''
    TAG_TABLE_TITLE = ''

    def packages_to_tag(self):
        """
        Subclasses must override this method to return the list of packages
        that must be tagged with the tag defined by `TAG_NAME`.

        :return: the packages to tag
        :rtype: list
        """
        return []

    def execute_package_tagging(self):
        """
        Refresh the tag data: drop all existing
        :class:`~distro_tracker.core.models.PackageData` entries stored
        under `TAG_NAME` and recreate one for each package returned by
        :meth:`packages_to_tag`, atomically.
        """
        with transaction.atomic():
            # Clear previous TaggedItems
            PackageData.objects.filter(key=self.TAG_NAME).delete()

            # All tags share the same rendering metadata.
            value = {
                'display_name': self.TAG_DISPLAY_NAME,
                'color_type': self.TAG_COLOR_TYPE,
                'description': self.TAG_DESCRIPTION,
                'table_title': self.TAG_TABLE_TITLE,
            }
            items = [
                PackageData(package=package, key=self.TAG_NAME, value=value)
                for package in self.packages_to_tag()
            ]
            PackageData.objects.bulk_create(items)