# -*- coding: utf-8 -*-
from __future__ import absolute_import
import six
from datetime import datetime
from random import randint, randrange
import uuid
from base64 import urlsafe_b64encode, b64encode, b64decode
import hashlib
import string
import re
from urlparse import urlparse
from collections import namedtuple, OrderedDict
import bcrypt
import pytz
import tldextract
from unidecode import unidecode
import bleach
if six.PY3:
from html import unescape
else:
import HTMLParser
unescape = HTMLParser.HTMLParser().unescape
del HTMLParser
from ._version import *
# --- Common delimiters and punctuation ---------------------------------------
# Quote-like characters that are deleted outright when building a slug.
# The pattern is assembled from adjacent literals: the raw pieces keep the regex
# escapes readable and the ``u''`` piece carries the non-ASCII characters.
# (``ur''`` literals are a syntax error on Python 3; this spelling produces the
# same text pattern on both Python 2 and Python 3.)
_strip_re = re.compile(r'[\'"`' u'‘’“”′″‴' r']+')
# Runs of whitespace and punctuation that separate the words of a slug.
_punctuation_re = re.compile(r'[\t +!#$%&()*\-/<=>?@\[\\\]^_{|}:;,.' u'…‒–—―«»' r']+')
# Lowercase alphanumerics with inner hyphens; may not start or end with a hyphen.
# Anchored with \Z rather than $ because $ also matches just before a trailing
# newline, which would let 'name\n' pass validation.
_username_valid_re = re.compile(r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?\Z')
# Dotted-quad IPv4 address with each octet limited to 0-255.
_ipv4_re = re.compile(r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\Z')
# --- Utilities ---------------------------------------------------------------
def buid():
    """
    Return a new random id that is exactly 22 characters long,
    by encoding a UUID4 in URL-safe Base64. See
    http://en.wikipedia.org/wiki/Base64#Variants_summary_table

    The result is a text string (``unicode`` on Python 2, ``str`` on Python 3).

    >>> len(buid())
    22
    >>> buid() == buid()
    False
    >>> isinstance(buid(), six.text_type)
    True
    """
    encoded = urlsafe_b64encode(uuid.uuid4().bytes)
    # Decode before stripping: on Python 3 the encoder returns bytes, and
    # bytes.rstrip() rejects a text argument. The 16 UUID bytes always encode
    # to 24 characters ending in '==', so 22 remain after stripping.
    return encoded.decode('ascii').rstrip(u'=')


# Retain old name
newid = buid
def newsecret():
    """
    Make a secret key for email confirmation and similar uses, built by
    concatenating two freshly generated ids. 44 characters long.

    >>> len(newsecret())
    44
    >>> newsecret() == newsecret()
    False
    """
    first_half = newid()
    second_half = newid()
    return first_half + second_half
def newpin(digits=4):
    """
    Return a random numeric string with the specified number of digits,
    default 4. The value is zero-padded on the left, so every string of
    ``digits`` digits is a possible result.

    :param int digits: Number of digits in the PIN, at least 1
    :raises ValueError: If ``digits`` is less than 1

    >>> len(newpin())
    4
    >>> len(newpin(5))
    5
    >>> newpin().isdigit()
    True
    """
    # Imported locally so the block does not depend on the module's import list.
    from random import SystemRandom

    if digits < 1:
        raise ValueError("digits must be a positive integer")
    # randrange() excludes the upper bound, so the result never has more than
    # `digits` digits and no retry loop is needed. SystemRandom draws from the
    # operating system's entropy source, since PINs act as short-lived secrets.
    pin = SystemRandom().randrange(10 ** digits)
    return u'%0*d' % (digits, pin)
def make_name(text, delim=u'-', maxlength=50, checkused=None, counter=2):
    u"""
    Generate an ASCII name slug. If a checkused filter is provided, it will
    be called with the candidate. If it returns True, make_name will add
    counter numbers starting from 2 until a suitable candidate is found.

    :param string text: Text to convert into a slug
    :param string delim: Delimiter between words, default '-'
    :param int maxlength: Maximum length of name, default 50
    :param checkused: Function to check if a generated name is available for use
    :param int counter: Starting position for name counter
    :returns: The slug, truncated to ``maxlength`` characters

    >>> make_name('This is a title')
    'this-is-a-title'
    >>> make_name('Invalid URL/slug here')
    'invalid-url-slug-here'
    >>> make_name('this.that')
    'this-that'
    >>> make_name('this:that')
    'this-that'
    >>> make_name("How 'bout this?")
    'how-bout-this'
    >>> make_name(u"How’s that?")
    'hows-that'
    >>> make_name(u'K & D')
    'k-d'
    >>> make_name('billion+ pageviews')
    'billion-pageviews'
    >>> make_name(u'हिन्दी slug!')
    'hindii-slug'
    >>> make_name(u'__name__', delim=u'_')
    'name'
    >>> make_name(u'how_about_this', delim=u'_')
    'how_about_this'
    >>> make_name(u'and-that', delim=u'_')
    'and_that'
    >>> make_name(u'Umlauts in Mötörhead')
    'umlauts-in-motorhead'
    >>> make_name('Candidate', checkused=lambda c: c in ['candidate'])
    'candidate2'
    >>> make_name('Candidate', checkused=lambda c: c in ['candidate'], counter=1)
    'candidate1'
    >>> make_name('Candidate', checkused=lambda c: c in ['candidate', 'candidate1', 'candidate2'], counter=1)
    'candidate3'
    >>> make_name('Long title, but snipped', maxlength=20)
    'long-title-but-snipp'
    >>> len(make_name('Long title, but snipped', maxlength=20))
    20
    >>> make_name('Long candidate', maxlength=10, checkused=lambda c: c in ['long-candi', 'long-cand1'])
    'long-cand2'
    >>> make_name(u'Lǝnkǝran')
    'lankaran'
    >>> make_name(u'example@example.com')
    'example-example-com'
    >>> make_name(u"quotes ' only")
    'quotes-only'
    """
    # Split into words on punctuation, then delete quote characters from each word.
    stripped_words = (_strip_re.sub('', word) for word in _punctuation_re.split(text.lower()))
    # Filter AFTER stripping: a word made only of quote characters becomes empty
    # and would otherwise leave two delimiters in a row.
    words = [word for word in stripped_words if word]
    # six.text_type is `unicode` on Python 2 and `str` on Python 3.
    name = six.text_type(delim.join(words))
    name = unidecode(name).replace('@', 'a')  # We don't know why unidecode uses '@' for 'a'-like chars
    candidate = name[:maxlength]
    if checkused is None:
        return candidate
    while checkused(candidate):
        # Make room for the counter so the candidate never exceeds maxlength.
        suffix = str(counter)
        candidate = name[:maxlength - len(suffix)] + suffix
        counter += 1
    return candidate
def make_password(password, encoding=u'BCRYPT'):
    """
    Make a password with PLAIN, SSHA or BCRYPT (default) encoding.

    :param password: Password as text or as UTF-8 encoded bytes. For the SSHA
        and BCRYPT encodings any other object is converted to text first.
    :param encoding: One of ``'PLAIN'``, ``'SSHA'`` or ``'BCRYPT'``
    :returns: Text string made of the ``{ENCODING}`` prefix and the encoded password
    :raises ValueError: If ``encoding`` is not one of the supported names

    >>> make_password('foo', encoding='PLAIN')
    u'{PLAIN}foo'
    >>> make_password(u'bar', encoding='PLAIN')
    u'{PLAIN}bar'
    >>> make_password(u're-foo', encoding='SSHA')[:6]
    u'{SSHA}'
    >>> make_password('bar-foo', encoding='SSHA')[:6]
    u'{SSHA}'
    >>> make_password(u're-foo')[:8]
    u'{BCRYPT}'
    >>> make_password('bar-foo')[:8]
    u'{BCRYPT}'
    >>> make_password('foo') == make_password('foo')
    False
    >>> check_password(make_password('ascii'), 'ascii')
    True
    >>> check_password(make_password('mixed'), u'mixed')
    True
    >>> check_password(make_password(u'unicode'), u'unicode')
    True
    """
    # Imported locally so the block does not depend on the module's import list.
    import os

    if encoding not in (u'PLAIN', u'SSHA', u'BCRYPT'):
        raise ValueError("Unknown encoding %s" % encoding)

    if encoding == u'PLAIN':
        # `bytes` is `str` on Python 2, so this covers byte strings on both versions.
        if isinstance(password, bytes):
            password = password.decode('utf-8')
        return u"{PLAIN}%s" % password

    # Both hashes operate on bytes. Formatting through u'%s' yields text on
    # Python 2 and Python 3 alike before it is encoded.
    if not isinstance(password, bytes):
        password = (u'%s' % (password,)).encode('utf-8')

    if encoding == u'SSHA':
        # SSHA is a modification of the SHA digest scheme with a salt
        # starting at byte 20 of the base64-encoded string.
        # Source: http://developer.netscape.com/docs/technote/ldap/pass_sha.html
        # The 7-byte salt comes from the OS entropy source rather than the
        # predictable `random` module generator.
        salt = os.urandom(7)
        digest = hashlib.sha1(password + salt).digest()
        return u'{SSHA}%s' % b64encode(digest + salt).decode('ascii')

    # BCRYPT is the recommended hash for secure passwords
    return u'{BCRYPT}%s' % bcrypt.hashpw(password, bcrypt.gensalt()).decode('ascii')
def check_password(reference, attempt):
    """
    Compare a reference password with the user attempt.

    :param reference: Stored password with its ``{PLAIN}``, ``{SSHA}`` or
        ``{BCRYPT}`` prefix, as text or UTF-8 bytes
    :param attempt: Password supplied by the user, as text or UTF-8 bytes
    :returns: True if the attempt matches the reference. False if it does not,
        if the prefix is missing or unknown, if an SSHA reference is not valid
        Base64, or if either argument is None.

    Comparisons use ``hmac.compare_digest`` (Python 2.7.7+ / 3.3+) so that the
    time taken does not reveal how much of the value matched.

    >>> check_password('{PLAIN}foo', 'foo')
    True
    >>> check_password(u'{PLAIN}bar', 'bar')
    True
    >>> check_password(u'{UNKNOWN}baz', 'baz')
    False
    >>> check_password(u'no-encoding', u'no-encoding')
    False
    >>> check_password(u'{SSHA}q/uVU8r15k/9QhRi92CWUwMJu2DM6TUSpp25', u're-foo')
    True
    >>> check_password('{SSHA}q/uVU8r15k/9QhRi92CWUwMJu2DM6TUSpp25', 're-foo')
    True
    """
    # Imported locally so the block does not depend on the module's import list.
    import hmac

    def to_bytes(value):
        # `bytes` is `str` on Python 2; anything else is expected to be text.
        return value if isinstance(value, bytes) else value.encode('utf-8')

    if reference is None or attempt is None:
        return False
    # Work on bytes throughout so text/bytes mixes compare consistently on
    # both Python 2 and Python 3.
    reference = to_bytes(reference)
    attempt = to_bytes(attempt)

    if reference.startswith(b'{PLAIN}'):
        return hmac.compare_digest(reference[7:], attempt)
    if reference.startswith(b'{SSHA}'):
        try:
            decoded = b64decode(reference[6:])
        except (TypeError, ValueError):
            # Python 2 raises TypeError and Python 3 raises binascii.Error
            # (a ValueError subclass) when the payload is not Base64.
            return False
        # The payload is a 20-byte SHA-1 digest followed by the salt.
        salt = decoded[20:]
        return hmac.compare_digest(decoded[:20], hashlib.sha1(attempt + salt).digest())
    if reference.startswith(b'{BCRYPT}'):
        # Hashing the attempt with the stored hash as salt reproduces the
        # stored hash when the password matches.
        stored = reference[8:]
        return hmac.compare_digest(bcrypt.hashpw(attempt, stored), stored)
    return False
def md5sum(data):
    """
    Return md5sum of data as a 32-character string.

    :param data: Bytes to hash, or text, which is encoded as UTF-8 first

    >>> md5sum('random text')
    'd9b9bec3f4cc5482e7c5ef43143e563a'
    >>> md5sum(u'random text')
    'd9b9bec3f4cc5482e7c5ef43143e563a'
    >>> len(md5sum('random text'))
    32
    """
    # hashlib only accepts bytes. Encoding explicitly makes text work on
    # Python 3 and makes non-ASCII text work on Python 2.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    return hashlib.md5(data).hexdigest()
def parse_isoformat(text):
    """
    Parse a timestamp of the form ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z`` into a
    naive datetime. The fractional-seconds form is tried first.

    :raises ValueError: If the text matches neither form
    """
    with_fraction = '%Y-%m-%dT%H:%M:%S.%fZ'
    without_fraction = '%Y-%m-%dT%H:%M:%SZ'
    try:
        parsed = datetime.strptime(text, with_fraction)
    except ValueError:
        parsed = datetime.strptime(text, without_fraction)
    return parsed
def getbool(value):
    """
    Returns a boolean from any of a range of values. Returns None for
    unrecognized values. Numbers other than 0 and 1 are considered
    unrecognized. Matching is case-insensitive.

    >>> getbool(True)
    True
    >>> getbool(1)
    True
    >>> getbool('1')
    True
    >>> getbool('t')
    True
    >>> getbool(2)
    >>> getbool(0)
    False
    >>> getbool(False)
    False
    >>> getbool('n')
    False
    """
    # `bytes` is `str` on Python 2. Decoding with 'replace' means undecodable
    # input falls through to None instead of raising.
    if isinstance(value, bytes):
        value = value.decode('utf-8', 'replace')
    # Format as text instead of calling str(): on Python 2, str() raises
    # UnicodeEncodeError for non-ASCII unicode input.
    text = (u'%s' % (value,)).lower()
    if text in (u'1', u't', u'true', u'y', u'yes'):
        return True
    if text in (u'0', u'f', u'false', u'n', u'no'):
        return False
    return None
def nullint(value):
    """
    Coerce an optional value to an integer: return int(value) when the value
    is truthy, and None for any falsy value.

    >>> nullint('10')
    10
    >>> nullint('') is None
    True
    """
    return int(value) if value else None
def nullstr(value):
    """
    Coerce an optional value to a string: return str(value) when the value
    is truthy, and None for any falsy value.

    >>> nullstr(10)
    '10'
    >>> nullstr('') is None
    True
    """
    return str(value) if value else None
def nullunicode(value):
    """
    Return the value converted to a text string (``unicode`` on Python 2,
    ``str`` on Python 3) if bool(value) is not False. Return None otherwise.
    Useful for coercing optional values to a string.

    >>> nullunicode(10)
    u'10'
    >>> nullunicode('') is None
    True
    """
    if not value:
        return None
    # The `unicode` builtin does not exist on Python 3; six.text_type names
    # the text type on both versions.
    return six.text_type(value)
def get_email_domain(email):
    """
    Return the domain component of an email address. Returns None unless the
    string has exactly one '@' with non-empty text on both sides of it.

    >>> get_email_domain('test@example.com')
    'example.com'
    >>> get_email_domain('test+trailing@example.com')
    'example.com'
    >>> get_email_domain('foobar')
    >>> get_email_domain('foo@bar@baz')
    >>> get_email_domain('foobar@')
    >>> get_email_domain('@foobar')
    """
    pieces = email.split('@')
    if len(pieces) != 2:
        return None
    local_part, domain_part = pieces
    if not local_part or not domain_part:
        return None
    return domain_part
# Default whitelist used by sanitize_html: maps each permitted HTML tag name to
# the list of attribute names permitted on that tag. An empty list means the
# tag is allowed but none of its attributes are.
VALID_TAGS = {
'a': ['href', 'title', 'target', 'rel'],
'abbr': ['title'],
'b': [],
'br': [],
'blockquote': [],
'cite': [],
'code': [],
'dd': [],
'del': [],
'dl': [],
'dt': [],
'em': [],
'h3': [],
'h4': [],
'h5': [],
'h6': [],
'hr': [],
'i': [],
'img': ['src', 'width', 'height', 'align', 'alt'],
'ins': [],
# NOTE(review): in HTML `start` is an attribute of <ol>, not <li> -- confirm
# whether this entry was meant for 'ol'.
'li': ['start'],
'mark': [],
'p': [],
'pre': [],
'ol': [],
'strong': [],
'sup': [],
'sub': [],
'ul': [],
}
def sanitize_html(value, valid_tags=VALID_TAGS, strip=True):
    """
    Strips unwanted markup out of HTML.

    :param value: HTML text to clean
    :param dict valid_tags: Mapping of allowed tag name to the list of
        attributes allowed on that tag, default :data:`VALID_TAGS`
    :param bool strip: Passed through to ``bleach.clean`` as its ``strip``
        argument
    :returns: The result of ``bleach.clean``
    """
    # Use the caller's whitelist. Previously the parameter was ignored and the
    # module-level VALID_TAGS was always applied.
    return bleach.clean(value, tags=list(valid_tags.keys()), attributes=valid_tags, strip=strip)
# Based on http://jasonpriem.org/obfuscation-decoder/
# A literal '.' with any surrounding non-word characters, or the word 'dot' /
# 'd0t' with non-word characters on both sides (case-insensitive).
_deobfuscate_dot1_re = re.compile(r'\W*\.\W*|\W+dot\W+|\W+d0t\W+', re.U | re.I)
# Uppercase 'DOT' directly between lowercase letters or digits.
_deobfuscate_dot2_re = re.compile(r'([a-z0-9])DOT([a-z0-9])')
# Lowercase 'dot' directly between uppercase letters or digits.
_deobfuscate_dot3_re = re.compile(r'([A-Z0-9])dot([A-Z0-9])')
# A literal '@' with any surrounding non-word characters, or the word 'at'
# with non-word characters on both sides (case-insensitive).
_deobfuscate_at1_re = re.compile(r'\W*@\W*|\W+at\W+', re.U | re.I)
# Uppercase 'AT' directly between lowercase letters or digits.
_deobfuscate_at2_re = re.compile(r'([a-z0-9])AT([a-z0-9])')
# Lowercase 'at' directly between uppercase letters or digits.
_deobfuscate_at3_re = re.compile(r'([A-Z0-9])at([A-Z0-9])')
def deobfuscate_email(text):
    """
    Deobfuscate email addresses in provided text: HTML entities are unescaped,
    then spelled-out "dot" and "at" forms are rewritten to '.' and '@'.
    """
    # Order matters: every "dot" rewrite runs before any "at" rewrite.
    rewrites = (
        (_deobfuscate_dot1_re, '.'),
        (_deobfuscate_dot2_re, r'\1.\2'),
        (_deobfuscate_dot3_re, r'\1.\2'),
        (_deobfuscate_at1_re, '@'),
        (_deobfuscate_at2_re, r'\1@\2'),
        (_deobfuscate_at3_re, r'\1@\2'),
    )
    result = unescape(text)
    for pattern, replacement in rewrites:
        result = pattern.sub(replacement, result)
    return result
def simplify_text(text):
    """
    Simplify text to allow comparison: ASCII punctuation is removed, the text
    is lowercased and runs of whitespace collapse to single spaces.

    Text input returns text and is lowercased with full Unicode rules. Byte
    string input (``str`` on Python 2, ``bytes`` on Python 3) returns a byte
    string, and only its ASCII letters are lowercased.

    >>> simplify_text("Awesome Coder wanted at Awesome Company")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted at Awesome Company! ")
    'awesome coder wanted at awesome company'
    >>> simplify_text(u"Awesome Coder, wanted at Awesome Company! ")
    u'awesome coder wanted at awesome company'
    """
    if isinstance(text, bytes):
        # translate(None, deletechars) deletes bytes without remapping any, and
        # is available on byte strings in both Python 2 and Python 3.
        cleaned = text.translate(None, string.punctuation.encode('ascii')).lower()
        return b" ".join(cleaned.split())
    # For text, a mapping of code point -> None deletes those characters.
    delete_punctuation = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text.translate(delete_punctuation).lower()
    return u" ".join(cleaned.split())
def valid_username(candidate):
    """
    Check if a username is valid, i.e. whether it matches the module's
    username pattern.

    >>> valid_username('example person')
    False
    >>> valid_username('example_person')
    False
    >>> valid_username('exampleperson')
    True
    >>> valid_username('example-person')
    True
    >>> valid_username('a')
    True
    >>> valid_username('a-') or valid_username('ab-') or valid_username('-a') or valid_username('-ab')
    False
    """
    found = _username_valid_re.search(candidate)
    return found is not None
def sorted_timezones():
    """
    Return a list of (timezone name, label) pairs sorted by offset from UTC,
    then by label. Each label includes the offset, the country name where one
    is known, the zone name with underscores replaced by spaces, and the
    zone's abbreviation.
    """
    def hourmin(delta):
        # A negative timedelta is stored as days=-1 plus positive seconds, so
        # its magnitude is recovered by subtracting the seconds from a full day.
        seconds = 86400 - delta.seconds if delta.days < 0 else delta.seconds
        hours, leftover = divmod(seconds, 3600)
        return hours, leftover // 60

    now = datetime.utcnow()

    # Reverse lookup: timezone name -> country code
    country_for_zone = {}
    for countrycode in pytz.country_timezones:
        for zone in pytz.country_timezones[countrycode]:
            country_for_zone[zone] = countrycode

    entries = []
    for tzname in pytz.common_timezones:
        # Discard the US/* and Canada/* zones since they aren't reliable for
        # DST, and discard UTC and GMT since timezones in that zone have their
        # own names
        if tzname.startswith(('US/', 'Canada/')) or tzname in ('GMT', 'UTC'):
            continue
        delta = pytz.timezone(tzname).utcoffset(now, is_dst=False)

        if delta.days < 0:
            sign = '-'
        elif delta.days == 0 and delta.seconds == 0:
            sign = ' '
        else:
            sign = '+'

        if tzname in country_for_zone:
            country = pytz.country_names[country_for_zone[tzname]] + ': '
        else:
            country = ''

        label = '%s%s - %s%s (%s)' % (
            sign,
            '%02d:%02d' % hourmin(delta),
            country,
            tzname.replace('_', ' '),
            pytz.timezone(tzname).tzname(now, is_dst=False))
        entries.append((delta, label, tzname))

    # Sort timezones by offset from UTC and their human-readable name
    entries.sort()
    # Return a list of (timezone, label) with the timezone offset included in the label.
    return [(tzname, label) for (delta, label, tzname) in entries]
def namespace_from_url(url):
    """
    Construct a dotted namespace string from a URL by reversing the labels of
    its hostname, dropping a 'www' label at the start of the hostname. Returns
    None if the URL has no hostname, or if the hostname is localhost or an
    IPv4 address.
    """
    hostname = urlparse(url).hostname
    if hostname is None:
        return None
    if hostname in ['localhost', 'localhost.localdomain']:
        return None
    if _ipv4_re.search(hostname):
        return None

    labels = hostname.split('.')[::-1]
    # A hostname ending in '.' leaves an empty first label after reversal.
    if labels and not labels[0]:
        del labels[0]
    if labels and labels[-1] == 'www':
        del labels[-1]
    return '.'.join(labels)
def base_domain_matches(d1, d2):
    """
    Check if two domains have the same base domain, using the Public Suffix List.

    >>> base_domain_matches('https://hasjob.co', 'hasjob.co')
    True
    >>> base_domain_matches('hasgeek.hasjob.co', 'hasjob.co')
    True
    >>> base_domain_matches('hasgeek.com', 'hasjob.co')
    False
    >>> base_domain_matches('static.hasgeek.co.in', 'hasgeek.com')
    False
    >>> base_domain_matches('static.hasgeek.co.in', 'hasgeek.co.in')
    True
    >>> base_domain_matches('example@example.com', 'example.com')
    True
    """
    first = tldextract.extract(d1)
    second = tldextract.extract(d2)
    # Each result carries subdomain, domain and suffix; only the domain and
    # suffix have to agree.
    if first.domain != second.domain:
        return False
    return first.suffix == second.suffix
def domain_namespace_match(domain, namespace):
    """
    Checks if namespace is related to the domain because the base domain matches.
    The namespace's dotted parts are reversed to form a domain for comparison.

    >>> domain_namespace_match('hasgeek.com', 'com.hasgeek')
    True
    >>> domain_namespace_match('funnel.hasgeek.com', 'com.hasgeek.funnel')
    True
    >>> domain_namespace_match('app.hasgeek.com', 'com.hasgeek.peopleflow')
    True
    >>> domain_namespace_match('app.hasgeek.in', 'com.hasgeek.peopleflow')
    False
    >>> domain_namespace_match('peopleflow.local', 'local.peopleflow')
    True
    """
    namespace_as_domain = ".".join(reversed(namespace.split(".")))
    return base_domain_matches(domain, namespace_as_domain)
# Label type used for three-value enum members: (value, name, title)
NameTitle = namedtuple('NameTitle', ['name', 'title'])


class _LabeledEnumMeta(type):
    """Construct labeled enumeration"""

    def __new__(cls, name, bases, attrs):
        # Every class attribute that is a 2-tuple or 3-tuple is replaced by its
        # first item, and the remainder is recorded as that value's label.
        labels = {}
        for attr_name, attr_value in list(attrs.items()):
            if not isinstance(attr_value, tuple):
                continue
            size = len(attr_value)
            if size == 2:
                label = attr_value[1]
            elif size == 3:
                label = NameTitle(attr_value[1], attr_value[2])
            else:
                continue
            labels[attr_value[0]] = label
            attrs[attr_name] = attr_value[0]
        # Labels are stored ordered by enum value.
        attrs['__labels__'] = OrderedDict(sorted(labels.items()))
        return type.__new__(cls, name, bases, attrs)

    def __getitem__(cls, key):
        return cls.__labels__[key]

    def __setitem__(cls, key, value):
        raise TypeError("LabeledEnum is immutable")
class LabeledEnum(six.with_metaclass(_LabeledEnumMeta)):
    """
    Labeled enumerations. Declare an enumeration with values and labels
    (for use in UI)::

        >>> class MY_ENUM(LabeledEnum):
        ...    FIRST = (1, "First")
        ...    THIRD = (3, "Third")
        ...    SECOND = (2, "Second")

    :class:`LabeledEnum` will convert any attribute that is a 2-tuple into
    a value and label pair. Access values as direct attributes of the enumeration::

        >>> MY_ENUM.FIRST
        1
        >>> MY_ENUM.SECOND
        2
        >>> MY_ENUM.THIRD
        3

    Access labels via dictionary lookup on the enumeration::

        >>> MY_ENUM[MY_ENUM.FIRST]
        'First'
        >>> MY_ENUM[2]
        'Second'
        >>> MY_ENUM.get(3)
        'Third'
        >>> MY_ENUM.get(4) is None
        True

    Retrieve a full list of values and labels with ``.items()``. Items are always
    sorted by value regardless of the original definition order (since Python
    doesn't provide a way to preserve that order)::

        >>> MY_ENUM.items()
        [(1, 'First'), (2, 'Second'), (3, 'Third')]

    Three value tuples are assumed to be (value, name, title) and the name and
    title are converted into NameTitle(name, title)::

        >>> class NAME_ENUM(LabeledEnum):
        ...    FIRST = (1, 'first', "First")
        ...    THIRD = (3, 'third', "Third")
        ...    SECOND = (2, 'second', "Second")

        >>> NAME_ENUM.FIRST
        1
        >>> NAME_ENUM[NAME_ENUM.FIRST]
        NameTitle(name='first', title='First')
        >>> NAME_ENUM[NAME_ENUM.SECOND].name
        'second'
        >>> NAME_ENUM[NAME_ENUM.THIRD].title
        'Third'

    Given a name, the value can be looked up::

        >>> NAME_ENUM.value_for('first')
        1
        >>> NAME_ENUM.value_for('second')
        2
    """

    @classmethod
    def get(cls, key, default=None):
        """Return the label for ``key``, or ``default`` if there is none."""
        return cls.__labels__.get(key, default)

    @classmethod
    def items(cls):
        """Return a list of (value, label) pairs, ordered by value."""
        # list() keeps the documented list return type on Python 3, where
        # dict.items() returns a view.
        return list(cls.__labels__.items())

    @classmethod
    def value_for(cls, name):
        """
        Return the value whose label is a NameTitle with the given name, or
        None if no label has that name.
        """
        for key, value in cls.__labels__.items():
            if isinstance(value, NameTitle) and value.name == name:
                return key
        return None