Coverage for distro_tracker/core/utils/http.py: 98%

2# See the COPYRIGHT file at the top-level directory of this distribution and

3# at https://deb.li/DTAuthors

5# This file is part of Distro Tracker. It is subject to the license terms

6# in the LICENSE file found in the top-level directory of this

7# distribution and at https://deb.li/DTLicense. No part of Distro Tracker,

8# including this file, may be copied, modified, propagated, or distributed

9# except according to the terms contained in the LICENSE file.

10"""

11Utilities for handling HTTP resource access.

12"""

14import json

15import os

16import re

17import time

18from hashlib import md5

20from django.conf import settings

21from django.shortcuts import redirect

22from django.utils import timezone

23from django.utils.http import parse_http_date, url_has_allowed_host_and_scheme

25import requests

26from requests.structures import CaseInsensitiveDict

28from .compression import get_uncompressed_stream, guess_compression_method

def parse_cache_control_header(header):
    """
    Parses the given Cache-Control header's values.

    Directive names are lower-cased (they are case-insensitive tokens) and
    surrounding whitespace is stripped from both names and values. Empty
    directives, such as the one produced by a trailing comma or an empty
    header, are skipped.

    :param header: The raw value of a Cache-Control header.
    :type header: str

    :returns: The key-value pairs found in the header.
        If some key did not have an associated value in the header, ``None``
        is used instead.
    :rtype: dict
    """
    cache_control = {}
    for part in header.split(','):
        key, separator, value = part.partition('=')
        key = key.strip().lower()
        if not key:
            # Nothing usable before the '=' (or an empty segment): ignore it
            # rather than registering a bogus '' directive.
            continue
        cache_control[key] = value.strip() if separator else None

    return cache_control

class HttpCache(object):
    """
    A class providing an interface to a cache of HTTP responses.

    Each cached URL is stored as two files below ``cache_directory_path``:
    the response body, and a JSON dump of the response headers kept in a
    sibling file whose name carries a ``?headers`` suffix.

    :param cache_directory_path: The directory holding the cached files.
    :param url_to_cache_path: An optional callable mapping a URL to a path
        relative to the cache directory; it replaces the default naming
        logic of :meth:`url_to_cache_path`.
    """
    def __init__(self, cache_directory_path,
                 url_to_cache_path=None):
        self.cache_directory_path = cache_directory_path
        self.custom_url_to_cache_path = url_to_cache_path

    def __contains__(self, item):
        # A URL counts as cached as soon as its content file exists.
        cache_file_name = self._content_cache_file_path(item)
        return os.path.exists(cache_file_name)

    def is_expired(self, url):
        """
        If the cached response for the given URL is expired based on
        Cache-Control or Expires headers, returns True.

        A response which is not cached, which carries no freshness
        information, or whose freshness information cannot be parsed is
        considered expired.
        """
        if url not in self:
            return True
        headers = self.get_headers(url)

        # First check if the Cache-Control header has set a max-age
        if 'cache-control' in headers:
            cache_control = parse_cache_control_header(
                headers['cache-control'])
            if 'max-age' in cache_control:
                try:
                    max_age = int(cache_control['max-age'])
                except (TypeError, ValueError):
                    # "max-age" without a value or with a non-integer one:
                    # invalid freshness information means stale.
                    return True
                # The mtime of the headers file records when the response
                # was stored.
                stored_at = int(
                    os.stat(self._header_cache_file_path(url)).st_mtime)
                current_timestamp = int(time.time())

                return current_timestamp - stored_at >= max_age

        # Alternatively, try the Expires header
        if 'expires' in headers:
            try:
                expires_timestamp = parse_http_date(headers['expires'])
            except ValueError:
                # Invalid dates (typically "0" or "-1") mean "already
                # expired".
                return True
            # Compare epoch timestamps so that the check works whether
            # timezone.now() returns an aware or a naive datetime.
            return timezone.now().timestamp() > expires_timestamp

        # If there is no cache freshness date consider the item expired
        return True

    def get_content_stream(self, url, compression="auto", text=False):
        """
        Returns a file-like object that reads the cached copy of the given URL.

        If the file is compressed, the file-like object will read the
        decompressed stream.

        ``None`` is returned when the URL is not cached.
        """
        if url in self:
            if compression == "auto":
                compression = guess_compression_method(url)

            # XXX: we leak temp_file... cf skipped test in test suite
            # of get_uncompressed_stream
            temp_file = open(self._content_cache_file_path(url), 'rb')
            return get_uncompressed_stream(temp_file, compression=compression,
                                           text=text)

    def get_content(self, url, compression="auto"):
        """
        Returns the content of the cached response for the given URL.

        If the file is compressed, then uncompress it, else, consider it
        as plain file. ``None`` is returned when the URL is not cached.

        :param compression: Specifies the compression method used to generate
            the resource, and thus the compression method one should use to
            decompress it.
        :type compression: str

        :rtype: :class:`bytes`

        """
        if url in self:
            with self.get_content_stream(url, compression=compression) as f:
                return f.read()

    def get_headers(self, url):
        """
        Returns the HTTP headers of the cached response for the given URL.

        An empty dict is returned when the URL is not cached.

        :rtype: dict
        """
        if url in self:
            with open(self._header_cache_file_path(url), 'r') as header_file:
                return CaseInsensitiveDict(json.load(header_file))
        else:
            return {}

    def remove(self, url):
        """
        Removes the cached response for the given URL.
        """
        if url in self:
            os.remove(self._content_cache_file_path(url))
            os.remove(self._header_cache_file_path(url))

    def update(self, url, force=False, invalidate_cache=True):
        """
        Performs an update of the cached resource. This means that it validates
        that its most current version is found in the cache by doing a
        conditional GET request.

        :param force: To force the method to perform a full GET request, set
            the parameter to ``True``
        :param invalidate_cache: When ``True``, the cached copy is removed if
            the server answers with an error status.

        :returns: The original HTTP response and a Boolean indicating whether
            the cached value was updated.
        :rtype: two-tuple of (:class:`requests.Response`, ``Boolean``)
        """
        cached_headers = self.get_headers(url)
        headers = {}
        if not force:
            if 'last-modified' in cached_headers:
                headers['If-Modified-Since'] = cached_headers['last-modified']
            if 'etag' in cached_headers:
                headers['If-None-Match'] = cached_headers['etag']
        else:
            # Ask all possible intermediate proxies to return a fresh response
            headers['Cache-Control'] = 'no-cache'

        verify = settings.DISTRO_TRACKER_CA_BUNDLE or True
        response = requests.get(url, headers=headers, verify=verify,
                                allow_redirects=True)

        # Invalidate previously cached value if the response is not valid now
        if not response.ok:
            if invalidate_cache:
                self.remove(url)
        elif response.status_code == 200:
            # Dump the content and headers only if a new response is generated
            with open(self._content_cache_file_path(url), 'wb') as content_file:
                content_file.write(response.content)
            with open(self._header_cache_file_path(url), 'w') as header_file:
                json.dump(dict(response.headers), header_file)

        # Anything but "304 Not Modified" is reported as an update, error
        # responses included.
        return response, response.status_code != 304

    def _prepare_path(self, cache_path):
        """
        Ensures that the directories leading to ``cache_path`` exist within
        the cache directory and returns the corresponding full path.
        """
        path = self.cache_directory_path
        dirname = os.path.dirname(cache_path)

        # Check the directory tree, create missing directories
        check_dir = path
        for component in dirname.split(os.path.sep):
            check_dir = os.path.join(check_dir, component)
            if os.path.isdir(check_dir):
                continue  # Expected case, avoid further checks
            elif os.path.exists(check_dir):
                # Handle conflicting file by renaming it
                target_directory = '{}?'.format(check_dir)
                if not os.path.exists(target_directory):
                    os.mkdir(target_directory)
                os.rename(check_dir, os.path.join(target_directory, 'index'))
                # Also rename the associated headers file if possible
                headers_file = check_dir + '?headers'
                if os.path.exists(headers_file):
                    os.rename(headers_file,
                              os.path.join(target_directory, 'index?headers'))
            os.mkdir(check_dir)

        return os.path.join(self.cache_directory_path, cache_path)

    def _content_cache_file_path(self, url):
        """Returns the full path of the file storing the response body."""
        path = self._prepare_path(self.url_to_cache_path(url))
        return path

    def _header_cache_file_path(self, url):
        """Returns the full path of the file storing the response headers."""
        header_cache_path = self.url_to_cache_path(url) + '?headers'
        path = self._prepare_path(header_cache_path)
        return path

    def url_to_cache_path(self, url):
        """
        Transforms an arbitrary URL into a relative path within the
        cache directory. Can be overridden by the user by supplying
        its own implementation in the ``url_to_cache_path`` attribute
        of the ``__init__()`` method.

        :param url: The URL to be cached.
        :type url: str

        :returns: A relative path within the cache directory, used to store a
            copy of the resource.
        """
        # Let the user supply its own naming logic
        if self.custom_url_to_cache_path:
            return self.custom_url_to_cache_path(url)

        # Normalizes URL into a sane path
        path = re.sub(r'^https?://', '', url, count=1, flags=re.IGNORECASE)
        path = re.sub(r'\?$', '', path)
        path = re.sub(r'/+', '/', path)
        path = re.sub(r'/+$', '', path)

        # Handle URL with GET parameters to allow caching of multiple versions
        # of the same path
        if '?' in path:
            (base, args) = path.split('?', maxsplit=1)
            path = base + '?/' + md5(args.encode('utf-8')).hexdigest()

        # Handle conflicting directory that will forbid save of the cache file
        if os.path.isdir(os.path.join(self.cache_directory_path, path)):
            path += '?/index'

        return path

263

264

def get_resource_content(url, cache=None, compression="auto",
                         only_if_updated=False, force_update=False,
                         ignore_network_failures=False,
                         ignore_http_error=None):
    """
    A helper function which returns the content of the resource found at the
    given URL.

    If the resource is already cached in the ``cache`` object and the cached
    content has not expired, the function will not do any HTTP requests and
    will return the cached content.

    If the resource is stale or not cached at all, it is retrieved from the
    Web.

    If the HTTP request returned an error code, the requests module will
    raise a :class:`requests.exceptions.HTTPError`.

    In case of network failures, some `IOError` exception will be raised unless
    `ignore_network_failures` is set to True.

    :param str url: The URL of the resource to be retrieved
    :param cache: A cache object which should be used to look up and store
        the cached resource. If it is not provided, an instance of
        :class:`HttpCache` with a
        ``DISTRO_TRACKER_CACHE_DIRECTORY`` cache directory
        is used.
    :type cache: :class:`HttpCache` or an object with an equivalent interface
    :param str compression: Specifies the compression method used to generate
        the resource, and thus the compression method one should use to
        decompress it. If auto, then guess it from the url file extension.
    :param bool only_if_updated: if set to `True` returns None when no update
        is done. Otherwise, returns the content in any case.
    :param bool force_update: if set to `True` do a new HTTP request even if we
        have non-expired data in the cache.
    :param bool ignore_network_failures: if set to `True`, then the function
        will return `None` in case of network failures and not raise any
        exception.
    :param int ignore_http_error: if the request results in an HTTP error
        with the given status code, then the error is ignored and no exception
        is raised. And `None` is returned.

    :returns: The bytes representation of the resource found at the given url
    :rtype: bytes
    """
    if cache is None:
        cache = HttpCache(settings.DISTRO_TRACKER_CACHE_DIRECTORY)

    was_updated = False
    needs_refresh = force_update or cache.is_expired(url)
    if needs_refresh:
        try:
            response, was_updated = cache.update(url, force=force_update)
        except IOError:
            if not ignore_network_failures:
                raise
            import logging
            logging.getLogger(__name__).warning(
                "Failed to update cache with data from %s", url, exc_info=1)
            return None

    if was_updated:
        # A new response came back: honour its HTTP status code.
        if ignore_http_error and response.status_code == ignore_http_error:
            return None
        response.raise_for_status()
    elif only_if_updated:
        # Nothing new was fetched; do not hand back the old data.
        return None

    return cache.get_content(url, compression=compression)

337

338

def get_resource_text(*args, **kwargs):
    """
    Text flavour of :py:func:`get_resource_content`: the downloaded content
    is transparently decoded into text. All the parameters of that function
    are supported, plus the encoding parameter.

    :param encoding: Specifies an encoding to decode the resource content.
        Defaults to ``utf-8``.
    :type encoding: str

    :returns: The textual representation of the resource found at the given
        url, or ``None`` when no content was returned.
    :rtype: str
    """
    # The encoding is consumed here; everything else is passed through.
    encoding = kwargs.pop('encoding', 'utf-8')
    raw = get_resource_content(*args, **kwargs)
    return None if raw is None else raw.decode(encoding)

357

358

def safe_redirect(to, fallback, allowed_hosts=None):
    """Redirects to `to` when that URL is considered safe, and to `fallback`
    otherwise. `allowed_hosts` describes the list of valid hosts for
    the call to :func:`django.utils.http.url_has_allowed_host_and_scheme`.

    :param to: The URL that one should be returned to.
    :type to: str or None

    :param fallback: A safe URL to fall back on if `to` isn't safe. WARNING!
        This url is NOT checked! The developer is advised to put only an url
        known to be safe!
    :type fallback: str

    :param allowed_hosts: A list of "safe" hosts. If `None`, relies on the
        default behaviour of
        :func:`django.utils.http.url_has_allowed_host_and_scheme`.
    :type allowed_hosts: list of str

    :returns: A ResponseRedirect instance containing the appropriate intel for
        the redirection.
    :rtype: :class:`django.http.HttpResponseRedirectBase`

    """
    # Start from the fallback and only switch to the requested URL once it
    # has been vetted.
    destination = fallback
    if to and url_has_allowed_host_and_scheme(to, allowed_hosts=allowed_hosts):
        destination = to
    return redirect(destination)

Coverage for distro_tracker/core/utils/http.py : 98%

156 statements 156 run 0 missing 0 excluded 5 partial