# Copyright 2013 The Distro Tracker Developers
# See the COPYRIGHT file at the top-level directory of this distribution and
# at https://deb.li/DTAuthors
#
# This file is part of Distro Tracker. It is subject to the license terms
# in the LICENSE file found in the top-level directory of this
# distribution and at https://deb.li/DTLicense. No part of Distro Tracker,
# including this file, may be copied, modified, propagated, or distributed
# except according to the terms contained in the LICENSE file.
"""
Utilities for handling HTTP resource access.
"""

import json
import logging
import os
import re
import time
from hashlib import md5

from django.conf import settings
from django.shortcuts import redirect
from django.utils import timezone
from django.utils.http import parse_http_date, url_has_allowed_host_and_scheme

import requests
from requests.structures import CaseInsensitiveDict

from .compression import get_uncompressed_stream, guess_compression_method


def parse_cache_control_header(header):
    """
    Parses the given Cache-Control header's values.

    Directive names are case-insensitive in HTTP, so the keys of the
    returned dict are normalized to lower case. Whitespace surrounding
    names and values is dropped and empty directives (an empty header, a
    trailing or doubled comma) are ignored.

    The header is split on every comma, so a quoted value that itself
    contains a comma is not supported. Quotes around a value are kept as-is.

    :param header: The raw value of a ``Cache-Control`` header.
    :type header: str

    :returns: The key-value pairs found in the header.
        If some key did not have an associated value in the header, ``None``
        is used instead.
    :rtype: dict
    """
    cache_control = {}
    for part in header.split(','):
        # partition() never fails: ``separator`` is empty when the directive
        # carries no value (e.g. ``no-cache``).
        key, separator, value = part.partition('=')
        key = key.strip().lower()
        if not key:
            # Nothing usable before the '=' (or an empty directive).
            continue
        cache_control[key] = value.strip() if separator else None

    return cache_control

class HttpCache:
    """
    A class providing an interface to a cache of HTTP responses.

    Each cached URL is stored as two files below ``cache_directory_path``:
    one with the response body and a sibling whose name ends with
    ``?headers`` holding the response headers serialized as JSON.
    """

    def __init__(self, cache_directory_path,
                 url_to_cache_path=None):
        """
        :param cache_directory_path: The directory below which the cached
            files are stored.
        :type cache_directory_path: str
        :param url_to_cache_path: Optional callable receiving a URL and
            returning the relative path to use within the cache directory.
            When given, it replaces the default naming logic of
            :meth:`url_to_cache_path`.
        """
        self.cache_directory_path = cache_directory_path
        self.custom_url_to_cache_path = url_to_cache_path

    def __contains__(self, item):
        """
        Returns True when a cached response body exists for the URL ``item``.
        """
        return os.path.exists(self._content_cache_file_path(item))

    def is_expired(self, url):
        """
        If the cached response for the given URL is expired based on
        Cache-Control or Expires headers, returns True.

        A response which is not cached, which carries no freshness
        information, or whose freshness information is invalid is considered
        expired.

        :rtype: bool
        """
        if url not in self:
            return True
        headers = self.get_headers(url)

        # First check if the Cache-Control header has set a max-age
        if 'cache-control' in headers:
            cache_control = parse_cache_control_header(
                headers['cache-control'])
            if 'max-age' in cache_control:
                try:
                    max_age = int(cache_control['max-age'])
                except (TypeError, ValueError):
                    # ``max-age`` without a value or with a non-integer
                    # value: invalid freshness information means stale.
                    return True
                # The modification time of the headers file is the moment
                # the response was stored.
                stored_at = int(
                    os.stat(self._header_cache_file_path(url)).st_mtime)
                current_timestamp = int(time.time())

                return current_timestamp - stored_at >= max_age

        # Alternatively, try the Expires header
        if 'expires' in headers:
            try:
                expires_timestamp = parse_http_date(headers['expires'])
            except ValueError:
                # Invalid dates (notably "0") mean "already expired".
                return True
            # parse_http_date() returns seconds since the epoch, so both
            # sides can be compared as plain timestamps.
            return time.time() > expires_timestamp

        # If there is no cache freshness date consider the item expired
        return True

    def get_content_stream(self, url, compression="auto", text=False):
        """
        Returns a file-like object that reads the cached copy of the given URL.

        If the file is compressed, the file-like object will read the
        decompressed stream.

        :param compression: The compression method of the cached file. With
            ``"auto"`` it is guessed from the URL.
        :type compression: str
        :param text: Forwarded to ``get_uncompressed_stream``.

        :returns: The stream, or ``None`` when the URL is not cached.
        """
        if url not in self:
            return None
        if compression == "auto":
            compression = guess_compression_method(url)

        # XXX: we leak temp_file... cf skipped test in test suite
        # of get_uncompressed_stream
        temp_file = open(self._content_cache_file_path(url), 'rb')
        try:
            return get_uncompressed_stream(temp_file, compression=compression,
                                           text=text)
        except Exception:
            # Nobody else holds a reference to the file at this point.
            temp_file.close()
            raise

    def get_content(self, url, compression="auto"):
        """
        Returns the content of the cached response for the given URL.

        If the file is compressed, then uncompress it, else, consider it
        as plain file.

        :param compression: Specifies the compression method used to generate
            the resource, and thus the compression method one should use to
            decompress it.
        :type compression: str

        :returns: The content, or ``None`` when the URL is not cached.
        :rtype: :class:`bytes`
        """
        stream = self.get_content_stream(url, compression=compression)
        if stream is None:
            return None
        with stream:
            return stream.read()

    def get_headers(self, url):
        """
        Returns the HTTP headers of the cached response for the given URL.

        An empty dict is returned when the URL is not cached, or when the
        stored headers are missing or cannot be decoded; callers then see a
        response without any freshness information.

        :rtype: dict
        """
        if url not in self:
            return {}
        try:
            with open(self._header_cache_file_path(url), 'r') as header_file:
                return CaseInsensitiveDict(json.load(header_file))
        except (FileNotFoundError, ValueError):
            # The body exists but its headers file is absent or is not
            # valid JSON (json.JSONDecodeError is a ValueError).
            return {}

    def remove(self, url):
        """
        Removes the cached response for the given URL.

        Does nothing when the URL is not cached.
        """
        if url not in self:
            return
        os.remove(self._content_cache_file_path(url))
        try:
            os.remove(self._header_cache_file_path(url))
        except FileNotFoundError:
            # The headers file may legitimately be missing already.
            pass

    def update(self, url, force=False, invalidate_cache=True, timeout=None):
        """
        Performs an update of the cached resource. This means that it validates
        that its most current version is found in the cache by doing a
        conditional GET request.

        :param force: To force the method to perform a full GET request, set
            the parameter to ``True``
        :param invalidate_cache: When ``True`` the cached copy is removed if
            the response is an HTTP error.
        :param timeout: Optional timeout handed over to ``requests.get``.
            ``None`` (the default) waits without limit.

        :returns: The original HTTP response and a Boolean indicating whether
            the cached value was updated.
        :rtype: two-tuple of (:class:`requests.Response`, ``Boolean``)
        """
        cached_headers = self.get_headers(url)
        headers = {}
        if not force:
            if 'last-modified' in cached_headers:
                headers['If-Modified-Since'] = cached_headers['last-modified']
            if 'etag' in cached_headers:
                headers['If-None-Match'] = cached_headers['etag']
        else:
            # Ask all possible intermediate proxies to return a fresh response
            headers['Cache-Control'] = 'no-cache'

        verify = settings.DISTRO_TRACKER_CA_BUNDLE or True
        response = requests.get(url, headers=headers, verify=verify,
                                allow_redirects=True, timeout=timeout)

        # Invalidate previously cached value if the response is not valid now
        if not response.ok:
            if invalidate_cache:
                self.remove(url)
        elif response.status_code == 200:
            # Dump the content and headers only if a new response is generated
            with open(self._content_cache_file_path(url), 'wb') as content_file:
                content_file.write(response.content)
            with open(self._header_cache_file_path(url), 'w') as header_file:
                json.dump(dict(response.headers), header_file)

        # Anything but "304 Not Modified" is reported as an update.
        return response, response.status_code != 304

    def _prepare_path(self, cache_path):
        """
        Creates the directories needed to store ``cache_path`` and returns
        the corresponding path within the cache directory.

        When a regular file sits where a directory is needed, that file is
        moved to ``<name>?/index`` (and its headers file to
        ``<name>?/index?headers``) to make room for the directory.

        :param cache_path: A path relative to the cache directory.
        :type cache_path: str

        :returns: ``cache_path`` joined to the cache directory.
        :rtype: str
        """
        path = self.cache_directory_path
        dirname = os.path.dirname(cache_path)

        # Check the directory tree, create missing directories
        check_dir = path
        for component in dirname.split(os.path.sep):
            check_dir = os.path.join(check_dir, component)
            if os.path.isdir(check_dir):
                continue  # Expected case, avoid further checks
            elif os.path.exists(check_dir):
                # Handle conflicting file by renaming it
                target_directory = '{}?'.format(check_dir)
                if not os.path.exists(target_directory):
                    os.mkdir(target_directory)
                os.rename(check_dir, os.path.join(target_directory, 'index'))
                # Also rename the associated headers file if possible
                headers_file = check_dir + '?headers'
                if os.path.exists(headers_file):
                    os.rename(headers_file,
                              os.path.join(target_directory, 'index?headers'))
            os.mkdir(check_dir)

        return os.path.join(self.cache_directory_path, cache_path)

    def _content_cache_file_path(self, url):
        """
        Returns the path of the file storing the response body of ``url``,
        creating its parent directories as a side effect.
        """
        return self._prepare_path(self.url_to_cache_path(url))

    def _header_cache_file_path(self, url):
        """
        Returns the path of the file storing the response headers of ``url``,
        creating its parent directories as a side effect.
        """
        header_cache_path = self.url_to_cache_path(url) + '?headers'
        return self._prepare_path(header_cache_path)

    def url_to_cache_path(self, url):
        """
        Transforms an arbitrary URL into a relative path within the
        cache directory. Can be overridden by the user by supplying
        its own implementation through the ``url_to_cache_path`` argument
        of the ``__init__()`` method.

        :param url: The URL to be cached.
        :type url: str

        :returns: A relative path within the cache directory, used to store a
            copy of the resource.
        """
        # Let the user supply its own naming logic
        if self.custom_url_to_cache_path:
            return self.custom_url_to_cache_path(url)

        # Normalizes URL into a sane path
        # NOTE(review): ".." components are kept as-is; this assumes URLs
        # come from trusted sources - confirm against callers.
        path = re.sub(r'^https?://', '', url, count=1, flags=re.IGNORECASE)
        path = re.sub(r'\?$', '', path)
        path = re.sub(r'/+', '/', path)
        path = re.sub(r'/+$', '', path)

        # Handle URL with GET parameters to allow caching of multiple versions
        # of the same path
        if '?' in path:
            base, args = path.split('?', maxsplit=1)
            path = base + '?/' + md5(args.encode('utf-8')).hexdigest()

        # Handle conflicting directory that will forbid save of the cache file
        if os.path.isdir(os.path.join(self.cache_directory_path, path)):
            path += '?/index'

        return path

def get_resource_content(url, cache=None, compression="auto",
                         only_if_updated=False, force_update=False,
                         ignore_network_failures=False,
                         ignore_http_error=None):
    """
    A helper function which returns the content of the resource found at the
    given URL.

    If the resource is already cached in the ``cache`` object and the cached
    content has not expired, the function will not do any HTTP requests and
    will return the cached content.

    If the resource is stale or not cached at all, it is fetched from the Web.

    If the HTTP request returned an error code, the requests module will
    raise a :class:`requests.exceptions.HTTPError`.

    In case of network failures, some `IOError` exception will be raised unless
    `ignore_network_failures` is set to True.

    :param str url: The URL of the resource to be retrieved
    :param cache: A cache object which should be used to look up and store
        the cached resource. If it is not provided, an instance of
        :class:`HttpCache` with a
        ``DISTRO_TRACKER_CACHE_DIRECTORY`` cache directory
        is used.
    :type cache: :class:`HttpCache` or an object with an equivalent interface
    :param str compression: Specifies the compression method used to generate
        the resource, and thus the compression method one should use to
        decompress it. If auto, then guess it from the url file extension.
    :param bool only_if_updated: if set to `True` returns None when no update
        is done. Otherwise, returns the content in any case.
    :param bool force_update: if set to `True` do a new HTTP request even if we
        have non-expired data in the cache.
    :param bool ignore_network_failures: if set to `True`, then the function
        will return `None` in case of network failures and not raise any
        exception.
    :param int ignore_http_error: if the request results in an HTTP error
        with the given status code, then the error is ignored and no exception
        is raised. And `None` is returned.

    :returns: The bytes representation of the resource found at the given url
    :rtype: bytes
    """
    if cache is None:
        cache_directory_path = settings.DISTRO_TRACKER_CACHE_DIRECTORY
        cache = HttpCache(cache_directory_path)

    updated = False
    if force_update or cache.is_expired(url):
        try:
            response, updated = cache.update(url, force=force_update)
        except IOError:
            if not ignore_network_failures:
                raise
            logging.getLogger(__name__).warning(
                "Failed to update cache with data from %s", url,
                exc_info=True)
            return None

    if updated:
        # ``response`` is always bound here: ``updated`` can only become
        # True through the call to cache.update() above.
        if ignore_http_error and response.status_code == ignore_http_error:
            return None
        response.raise_for_status()
    elif only_if_updated:
        return None  # Stop without returning old data

    return cache.get_content(url, compression=compression)

def get_resource_text(*args, **kwargs):
    """
    Text flavour of :py:func:`get_resource_content`: fetches the resource
    the same way and decodes the resulting bytes. All the parameters of
    :py:func:`get_resource_content` are accepted, plus ``encoding``.

    :param encoding: The encoding used to decode the resource content.
        Defaults to ``utf-8``.
    :type encoding: str

    :returns: The textual representation of the resource found at the given
        url, or ``None`` when no content was returned.
    :rtype: str
    """
    # ``encoding`` is specific to this wrapper and must not be forwarded.
    encoding = kwargs.pop('encoding', 'utf-8')
    raw = get_resource_content(*args, **kwargs)
    if raw is None:
        return None
    return raw.decode(encoding)

def safe_redirect(to, fallback, allowed_hosts=None):
    """Redirects to `to` when it is a safe target, else to `fallback`.

    `to` is considered safe when it is non-empty and accepted by
    :func:`django.utils.http.url_has_allowed_host_and_scheme` for the given
    `allowed_hosts`.

    :param to: The URL that one should be returned to.
    :type to: str or None

    :param fallback: The URL used when `to` is not safe. WARNING! This URL
        is NOT checked: only pass a URL known to be safe.
    :type fallback: str

    :param allowed_hosts: A list of "safe" hosts. If `None`, relies on the
        default behaviour of
        :func:`django.utils.http.url_has_allowed_host_and_scheme`.
    :type allowed_hosts: list of str

    :returns: The redirect response built by
        :func:`django.shortcuts.redirect` for the selected URL.
    :rtype: :class:`django.http.HttpResponseRedirectBase`
    """
    # An empty or missing target is never checked, it is simply unsafe.
    target_is_safe = bool(to) and url_has_allowed_host_and_scheme(
        to, allowed_hosts=allowed_hosts)
    destination = to if target_is_safe else fallback
    return redirect(destination)