author Shimeng (Simon) Wang <swang@google.com> 2010-02-10 11:22:01 -0800
committer Shimeng (Simon) Wang <swang@google.com> 2010-02-10 11:22:01 -0800
commit  56811abc376e86d31ebd799b65cfa62cfaf8e16e (patch)
tree    4e6c02fcfc3fabb253a52cf7342c5def7e1687f3
parent  e757f9f464f63a0ed0ca367cabb96f9f9f9e5e49 (diff)
Add back lost Python script.
The script is used to generate the top-level domains' regular expressions. It has been enhanced and rerun to regenerate the patterns for the new top-level domains.

new file: common/tools/make-iana-tld-pattern.py
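For reference, a sketch of how the script is run (it assumes a Python 2 interpreter, since it uses urllib2 and the print statement, plus network access to data.iana.org; the output file name below is only illustrative):

    python common/tools/make-iana-tld-pattern.py > patterns.txt

The script prints the generated TOP_LEVEL_DOMAIN declaration first, followed by the WEB_URL declaration.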
-rwxr-xr-x  common/tools/make-iana-tld-pattern.py  160
1 file changed, 160 insertions, 0 deletions
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
new file mode 100755
index 0000000000..ece4dcfea1
--- /dev/null
+++ b/common/tools/make-iana-tld-pattern.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+from urllib2 import urlopen
+
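+# The generated Java source is assembled as PREFIX + one line per
+# first-letter bucket + SUFFIX.  Two prefix/suffix pairs follow: one for
+# the bare top-level-domain pattern, one for the full WEB_URL pattern.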
+TLD_PREFIX = r"""
+ /**
+ * Regular expression pattern to match all IANA top-level domains.
+ * List accurate as of 2010/02/05. List taken from:
+ * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+ */
+ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+"""
+TLD_SUFFIX = '");'
+
+URL_PREFIX = r"""
+ /**
+ * Regular expression pattern to match RFC 1738 URLs
+ * List accurate as of 2010/02/05. List taken from:
+ * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+ * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+ */
+ public static final Pattern WEB_URL = Pattern.compile(
+ "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ + "(?:" // plus top level domain
+"""
+
+URL_SUFFIX = r"""
+ + "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ + "|[1-9][0-9]|[0-9])))"
+    + "(?:\\:\\d{1,5})?)" // plus optional port number
+    + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus optional query params
+ + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ + "(?:\\b|$)"); // and finally, a word boundary or end of
+ // input. This is to stop foo.sure from
+ // matching as foo.su
+"""
+
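+# A Bucket collects every TLD that begins with the same letter.  Two-letter
+# TLDs are stored by their second letter only (self.letters) so they can be
+# folded into a single character class such as "a[cdefgilmnoqrstuwxz]";
+# longer TLDs are kept whole (self.words) and emitted as an alternation.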
+class Bucket:
+ def __init__(self, baseLetter):
+        self.base = baseLetter
+        self.words = []
+        self.letters = []
+
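+    # Emit this bucket as one line of the generated Java string literal,
+    # e.g. '+ "|b(?:biz|...|b[abdefghijmnorstvwyz])"'.  isFirst/isLast
+    # control the opening '"(' and the trailing quote; isWebUrl emits
+    # non-capturing groups so WEB_URL's capturing-group numbering is stable.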
+ def dump(self, isWebUrl=False, isFirst=False, isLast=False):
+ if (len(self.words) == 0) and (len(self.letters) == 0):
+ return ''
+
+ self.words.sort()
+ self.letters.sort()
+
+        output = ' '
+
+ if isFirst:
+ if isWebUrl:
+ output += '+ "'
+ else:
+ output += '"('
+ else:
+ output += '+ "|'
+
+        if len(self.words) != 0:
+            output += '('
+
+            if isWebUrl:
+                output += '?:'
+
+            firstWord = True
+            for word in self.words:
+                if not firstWord:
+                    output += '|'
+                firstWord = False
+                for letter in word:
+                    if letter == '-':
+                        output += '\\\\' # doubled so the Java literal reads \\- (an escaped '-')
+                    output += letter
+
+ if len(self.words) > 0 and len(self.letters) > 0:
+ output += '|'
+
+ if len(self.letters) == 1:
+ output += '%c%c' % (self.base, self.letters[0])
+ elif len(self.letters) > 0:
+ output += '%c[' % self.base
+
+ for letter in self.letters:
+ output += letter
+
+ output += ']'
+
+ if len(self.words) != 0:
+ output += ')'
+
+        if not isLast:
+            output += '"'
+            output += '\n'
+
+        return output
+
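+    # Record one line of the IANA file: comment lines and blanks are
+    # skipped, a two-letter TLD contributes its second letter to
+    # self.letters, and anything longer goes into self.words.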
+ def add(self, line):
+ length = len(line)
+
+ if line.startswith('#') or (length == 0):
+            return
+
+ if length == 2:
+ self.letters.append(line[1:2])
+ else:
+ self.words.append(line)
+
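+# Return the bucket for the line's first letter, creating it on first use.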
+def getBucket(buckets, line):
+ letter = line[0]
+ bucket = buckets.get(letter)
+
+ if bucket is None:
+ bucket = Bucket(letter)
+ buckets[letter] = bucket
+
+ return bucket
+
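+# Concatenate prefix + the per-letter bucket dumps + suffix and print the
+# finished Java declaration.  Buckets 'a' and 'z' are dumped outside the
+# loop so the first and last lines get their special punctuation.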
+def makePattern(prefix, suffix, buckets, isWebUrl=False):
+ output = prefix
+
+ output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
+
+ for letter in range(ord('b'), ord('z')):
+ output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
+
+ output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
+
+ if isWebUrl:
+ output += '))"'
+ else:
+ output += ')'
+
+ output += suffix
+
+ print output
+
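+# Fetch the live IANA TLD list, bucket each entry by its first letter, and
+# print the two generated patterns (TOP_LEVEL_DOMAIN, then WEB_URL).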
+if __name__ == "__main__":
+ f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
+ domains = f.readlines()
+ f.close()
+
+ buckets = {}
+
+ for domain in domains:
+ domain = domain.lower()
+
+ if len(domain) > 0:
+ getBucket(buckets, domain[0]).add(domain.strip())
+
+ makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
+ makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)