1# Copyright 2013 The Distro Tracker Developers
2# See the COPYRIGHT file at the top-level directory of this distribution and
3# at https://deb.li/DTAuthors
4#
5# This file is part of Distro Tracker. It is subject to the license terms
6# in the LICENSE file found in the top-level directory of this
7# distribution and at https://deb.li/DTLicense. No part of Distro Tracker,
8# including this file, may be copied, modified, propagated, or distributed
9# except according to the terms contained in the LICENSE file.
10"""
11Utilities for handling HTTP resource access.
12"""
14import json
15import os
16import re
17import time
18from hashlib import md5
20from django.conf import settings
21from django.shortcuts import redirect
22from django.utils import timezone
23from django.utils.http import parse_http_date, url_has_allowed_host_and_scheme
25import requests
26from requests.structures import CaseInsensitiveDict
28from .compression import get_uncompressed_stream, guess_compression_method
def parse_cache_control_header(header):
    """
    Parses the given Cache-Control header's values.

    Directive names are compared case-insensitively in HTTP, so they are
    normalized to lower case. Whitespace surrounding names and values is
    dropped and empty directives (as produced by an empty header or by a
    stray comma) are ignored.

    :param header: The raw value of the Cache-Control header.
    :type header: str

    :returns: The key-value pairs found in the header.
        If some key did not have an associated value in the header, ``None``
        is used instead.
    :rtype: dict
    """
    cache_control = {}
    for part in header.split(','):
        # Only the first '=' separates the name from its value
        key, has_value, value = part.partition('=')
        key = key.strip().lower()
        if not key:
            # Nothing usable between two commas (or in an empty header)
            continue
        cache_control[key] = value.strip() if has_value else None

    return cache_control
class HttpCache(object):
    """
    A class providing an interface to a cache of HTTP responses.

    For each cached URL, two files are stored below the cache directory:
    the response body, and a JSON dump of the response headers kept in a
    sibling file whose name ends with ``?headers``.

    :param cache_directory_path: The directory holding the cached files.
    :param url_to_cache_path: An optional callable which maps a URL to a
        relative path within the cache directory. When given, it replaces
        the default naming logic of :meth:`url_to_cache_path`.
    """
    def __init__(self, cache_directory_path,
                 url_to_cache_path=None):
        self.cache_directory_path = cache_directory_path
        self.custom_url_to_cache_path = url_to_cache_path

    def __contains__(self, item):
        """
        Returns True when a cached response body exists for the given URL.
        """
        cache_file_name = self._content_cache_file_path(item)
        return os.path.exists(cache_file_name)

    def is_expired(self, url):
        """
        If the cached response for the given URL is expired based on
        Cache-Control or Expires headers, returns True.

        A URL which is not cached, or whose cached headers carry no usable
        freshness information, is considered expired too.
        """
        if url not in self:
            return True
        headers = self.get_headers(url)

        # First check if the Cache-Control header has set a max-age
        if 'cache-control' in headers:
            cache_control = parse_cache_control_header(headers['cache-control'])
            if 'max-age' in cache_control:
                try:
                    max_age = int(cache_control['max-age'])
                except (TypeError, ValueError):
                    # A max-age without a valid integer value gives no
                    # freshness lifetime: consider the response stale.
                    return True
                # The modification time of the headers file records when
                # the response was stored.
                response_age = int(
                    os.stat(self._header_cache_file_path(url)).st_mtime)
                current_timestamp = int(time.time())

                return current_timestamp - response_age >= max_age

        # Alternatively, try the Expires header
        if 'expires' in headers:
            try:
                expires_timestamp = parse_http_date(headers['expires'])
            except ValueError:
                # Invalid dates (e.g. "Expires: 0") mean "already expired"
                return True
            # Compare POSIX timestamps so that the check does not depend on
            # whether timezone.now() returns an aware or a naive datetime.
            return timezone.now().timestamp() > expires_timestamp

        # If there is no cache freshness date consider the item expired
        return True

    def get_content_stream(self, url, compression="auto", text=False):
        """
        Returns a file-like object that reads the cached copy of the given URL.

        If the file is compressed, the file-like object will read the
        decompressed stream.

        :param compression: The compression method of the cached file. With
            ``"auto"`` the method is guessed from the URL.
        :param text: Forwarded as-is to ``get_uncompressed_stream``.

        :returns: The stream, or ``None`` when the URL is not cached.
        """
        if url in self:
            if compression == "auto":
                compression = guess_compression_method(url)

            # XXX: we leak temp_file... cf skipped test in test suite
            # of get_uncompressed_stream
            temp_file = open(self._content_cache_file_path(url), 'rb')
            return get_uncompressed_stream(temp_file, compression=compression,
                                           text=text)

    def get_content(self, url, compression="auto"):
        """
        Returns the content of the cached response for the given URL.

        If the file is compressed, then uncompress it, else, consider it
        as plain file.

        :param compression: Specifies the compression method used to generate
            the resource, and thus the compression method one should use to
            decompress it.
        :type compression: str

        :returns: The content, or ``None`` when the URL is not cached.
        :rtype: :class:`bytes`
        """
        if url in self:
            with self.get_content_stream(url, compression=compression) as f:
                return f.read()

    def get_headers(self, url):
        """
        Returns the HTTP headers of the cached response for the given URL.

        An empty dict is returned when the URL is not cached, and also when
        the headers file is missing or does not contain valid JSON, so that
        an incomplete cache entry gets refreshed instead of raising.

        :rtype: dict
        """
        if url not in self:
            return {}
        try:
            with open(self._header_cache_file_path(url), 'r') as header_file:
                stored_headers = json.load(header_file)
        except (FileNotFoundError, ValueError):
            # The body and the headers are written as two separate files,
            # so one can exist without a usable counterpart.
            return {}
        return CaseInsensitiveDict(stored_headers)

    def remove(self, url):
        """
        Removes the cached response for the given URL.

        Does nothing when the URL is not cached. A missing headers file is
        tolerated so that incomplete cache entries can still be dropped.
        """
        if url in self:
            cache_files = (
                self._content_cache_file_path(url),
                self._header_cache_file_path(url),
            )
            for cache_file in cache_files:
                try:
                    os.remove(cache_file)
                except FileNotFoundError:
                    # Already gone: the entry ends up removed either way
                    pass

    def update(self, url, force=False, invalidate_cache=True):
        """
        Performs an update of the cached resource. This means that it validates
        that its most current version is found in the cache by doing a
        conditional GET request.

        :param force: To force the method to perform a full GET request, set
            the parameter to ``True``
        :param invalidate_cache: When ``True``, the cached response is
            removed if the server answers with an error status.

        :returns: The original HTTP response and a Boolean indicating whether
            the cached value was updated. The Boolean is ``False`` only when
            the server answered with a 304 status code.
        :rtype: two-tuple of (:class:`requests.Response`, ``Boolean``)
        """
        cached_headers = self.get_headers(url)
        headers = {}
        if not force:
            # Conditional request based on the validators stored in the cache
            if 'last-modified' in cached_headers:
                headers['If-Modified-Since'] = cached_headers['last-modified']
            if 'etag' in cached_headers:
                headers['If-None-Match'] = cached_headers['etag']
        else:
            # Ask all possible intermediate proxies to return a fresh response
            headers['Cache-Control'] = 'no-cache'

        verify = settings.DISTRO_TRACKER_CA_BUNDLE or True
        response = requests.get(url, headers=headers, verify=verify,
                                allow_redirects=True)

        # Invalidate previously cached value if the response is not valid now
        if not response.ok:
            if invalidate_cache:
                self.remove(url)
        elif response.status_code == 200:
            # Dump the content and headers only if a new response is generated
            with open(self._content_cache_file_path(url), 'wb') as content_file:
                content_file.write(response.content)
            with open(self._header_cache_file_path(url), 'w') as header_file:
                json.dump(dict(response.headers), header_file)

        return response, response.status_code != 304

    def _prepare_path(self, cache_path):
        """
        Ensures that the parent directories of the given relative cache path
        exist below the cache directory.

        A regular file found where a directory is needed is moved to
        ``<name>?/index``, together with its ``?headers`` file when there is
        one, so that the directory can be created in its place.

        :param cache_path: A path relative to the cache directory.
        :returns: The cache path joined to the cache directory.
        """
        path = self.cache_directory_path
        dirname = os.path.dirname(cache_path)

        # Check the directory tree, create missing directories
        check_dir = path
        for component in dirname.split(os.path.sep):
            check_dir = os.path.join(check_dir, component)
            if os.path.isdir(check_dir):
                continue  # Expected case, avoid further checks
            elif os.path.exists(check_dir):
                # Handle conflicting file by renaming it
                target_directory = '{}?'.format(check_dir)
                if not os.path.exists(target_directory):
                    os.mkdir(target_directory)
                os.rename(check_dir, os.path.join(target_directory, 'index'))
                # Also rename the associated headers file if possible
                headers_file = check_dir + '?headers'
                if os.path.exists(headers_file):
                    os.rename(headers_file,
                              os.path.join(target_directory, 'index?headers'))
            os.mkdir(check_dir)

        return os.path.join(self.cache_directory_path, cache_path)

    def _content_cache_file_path(self, url):
        """
        Returns the path of the file storing the response body of the URL.
        """
        path = self._prepare_path(self.url_to_cache_path(url))
        return path

    def _header_cache_file_path(self, url):
        """
        Returns the path of the file storing the response headers of the URL.
        """
        header_cache_path = self.url_to_cache_path(url) + '?headers'
        path = self._prepare_path(header_cache_path)
        return path

    def url_to_cache_path(self, url):
        """
        Transforms an arbitrary URL into a relative path within the
        cache directory. Can be overridden by the user by supplying
        its own implementation in the ``url_to_cache_path`` attribute
        of the ``__init__()`` method.

        :param url: The URL to be cached.
        :type url: str

        :returns: A relative path within the cache directory, used to store a
            copy of the resource.
        """
        # Let the user supply its own naming logic
        if self.custom_url_to_cache_path:
            return self.custom_url_to_cache_path(url)

        # Normalizes URL into a sane path
        path = re.sub(r'^https?://', '', url, count=1, flags=re.IGNORECASE)
        path = re.sub(r'\?$', '', path)
        path = re.sub(r'/+', '/', path)
        path = re.sub(r'/+$', '', path)

        # Handle URL with GET parameters to allow caching of multiple versions
        # of the same path
        if '?' in path:
            (base, args) = path.split('?', maxsplit=1)
            path = base + '?/' + md5(args.encode('utf-8')).hexdigest()

        # Handle conflicting directory that will forbid save of the cache file
        if os.path.isdir(os.path.join(self.cache_directory_path, path)):
            path += '?/index'

        return path
def get_resource_content(url, cache=None, compression="auto",
                         only_if_updated=False, force_update=False,
                         ignore_network_failures=False,
                         ignore_http_error=None):
    """
    A helper function which returns the content of the resource found at the
    given URL.

    If the resource is already cached in the ``cache`` object and the cached
    content has not expired, the function will not do any HTTP requests and
    will return the cached content.

    If the resource is stale or not cached at all, it is retrieved from the
    Web.

    If the HTTP request returned an error code, the requests module will
    raise a :class:`requests.exceptions.HTTPError`.

    In case of network failures, some `IOError` exception will be raised unless
    `ignore_network_failures` is set to True.

    :param str url: The URL of the resource to be retrieved
    :param cache: A cache object which should be used to look up and store
        the cached resource. If it is not provided, an instance of
        :class:`HttpCache` with a
        ``DISTRO_TRACKER_CACHE_DIRECTORY`` cache directory
        is used.
    :type cache: :class:`HttpCache` or an object with an equivalent interface
    :param str compression: Specifies the compression method used to generate
        the resource, and thus the compression method one should use to
        decompress it. If auto, then guess it from the url file extension.
    :param bool only_if_updated: if set to `True` returns None when no update
        is done. Otherwise, returns the content in any case.
    :param bool force_update: if set to `True` do a new HTTP request even if we
        have non-expired data in the cache.
    :param bool ignore_network_failures: if set to `True`, then the function
        will return `None` in case of network failures and not raise any
        exception.
    :param int ignore_http_error: if the request results in an HTTP error
        with the given status code, then the error is ignored and no exception
        is raised. And `None` is returned.

    :returns: The bytes representation of the resource found at the given url
    :rtype: bytes
    """
    if cache is None:
        cache = HttpCache(settings.DISTRO_TRACKER_CACHE_DIRECTORY)

    refreshed = False
    if force_update or cache.is_expired(url):
        try:
            response, refreshed = cache.update(url, force=force_update)
        except IOError:
            if not ignore_network_failures:
                raise
            import logging
            logging.getLogger(__name__).warning(
                "Failed to update cache with data from %s", url, exc_info=1)
            return None

    if not refreshed:
        if only_if_updated:
            # Stop without returning old data
            return None
    elif ignore_http_error and response.status_code == ignore_http_error:
        # The caller asked to silently ignore this HTTP status code
        return None
    else:
        # Any other HTTP error is reported to the caller
        response.raise_for_status()

    return cache.get_content(url, compression=compression)
def get_resource_text(*args, **kwargs):
    """
    Text flavour of :py:func:`get_resource_content`: the downloaded content
    is transparently decoded into a string. All the parameters of that
    function are supported, plus the encoding parameter.

    :param encoding: The encoding used to decode the resource content.
        Defaults to ``utf-8``.
    :type encoding: str

    :returns: The textual representation of the resource found at the given
        url, or ``None`` when no content was returned.
    :rtype: str
    """
    # The encoding is consumed here, everything else is passed through
    encoding = kwargs.pop('encoding', 'utf-8')
    raw = get_resource_content(*args, **kwargs)
    return None if raw is None else raw.decode(encoding)
def safe_redirect(to, fallback, allowed_hosts=None):
    """Redirects to `to` when that URL is considered safe, and to `fallback`
    otherwise. `allowed_hosts` is the list of valid hosts handed over to
    :func:`django.utils.http.url_has_allowed_host_and_scheme`.

    :param to: The URL that one should be returned to.
    :type to: str or None

    :param fallback: A safe URL to fall back on if `to` isn't safe. WARNING!
        This URL is NOT checked! Only pass a URL which is known to be safe!
    :type fallback: str

    :param allowed_hosts: A list of "safe" hosts. If `None`, relies on the
        default behaviour of
        :func:`django.utils.http.url_has_allowed_host_and_scheme`.
    :type allowed_hosts: list of str

    :returns: A ResponseRedirect instance containing the appropriate intel for
        the redirection.
    :rtype: :class:`django.http.HttpResponseRedirectBase`

    """
    # An empty or missing target is never checked and goes to the fallback
    is_safe = bool(to) and url_has_allowed_host_and_scheme(
        to, allowed_hosts=allowed_hosts)
    destination = to if is_safe else fallback
    return redirect(destination)