#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
#
# create filter.yaml email filter from IpToCountry.csv file (from software77.net/geo-ip)
#
# Requires Python 3 (ported from the original Python 2 script).
#
# To Do:
# - maybe one day we can use \d instead of [0-9] - right now yaml requires \\\\d, which does not provide any savings in filter size
# - maybe one day we can use \. instead of [.] - right now yaml requires \\\\., which results in negative savings

import os
import re
import stat
import sys
import time

# Names exported by "from <module> import *".  The self-test routines
# (test_single_octets, testMakeEndRe, testIPsAgainstREs, testIPsAgainstMergedREs)
# are left out on purpose: they are meant to be started through the command-line
# switches handled in main(); still reachable by explicit import.
__all__ = [
    "folderForProbableSpam", "folderForSureSpam",
    "ip2c", "updateInterval", "filterFileName", "filterProlog", "filterEpilog",
    "countriesToExclude", "countriesProbablySpam",
    "maxLineLen", "filterActionWhitelist", "filterActionDiscard",
    "filterActionSpamProbably", "filterActionSpamPrettySure",
    "mapCountryName", "loadIp2Country", "download",
    "num2octets", "octets2str", "singleDigitRangeRe", "octetPair2rangeRE",
    "makeEndRe", "makeRe", "createMergedRE", "createREs",
    "makeFilterFromIpToCountryCSV",
    "addCommas", "numIPsInCountry",
    "dumpCountriesIPsAndREs", "dumpCountriesIPsAndMergedREs", "dumpCounts",
    "mapUserCountyName", "main",
]

# ----------------------------------------------------------
# These, you _should_ change - paths are relative to ~/mail
folderForProbableSpam = "jacek-dom.net/spam-probably"
folderForSureSpam = "jacek-dom.net/spam-prettysure"

# ----------------------------------------------------------
# These you _may_ change
ip2c = "IpToCountry.csv"
updateInterval = 7  # days
filterFileName = "filter.yaml"
filterProlog = "filter-prolog.yaml"
filterEpilog = "filter-epilog.yaml"
countriesToExclude = [
    "Reserved", "United States", "Canada", "Poland",
    # NOTE(review): the line break of the next (disabled) line was lost in the
    # copy this file was restored from; "European Union" is assumed to belong
    # to the commented-out part - confirm.
    # "Australia", "European Union",
    # included in AfriNic (in prolog): (41., 196. and few others)
    "Angola", "Botswana", "Burundi", "Cape Verde", "Central African Republic",
    "Chad", "Congo", "Congo DR", "Djibouti", "Equatorial Guinea", "Eritrea",
    "Gambia", "Guinea", "Guinea-bissau", "Liberia", "Madagascar", "Malawi",
    "Mozambique", "Niger", "Reunion", "Rwanda", "Seychelles", "Sierra Leone",
    "Somalia", "Tanzania", "Zambia", "Zimbabwe",
    # never got any spam from - ARIN:
    "Anguilla", "Antigua and Barbuda", "Bahamas", "Bermuda", "Cayman Islands",
    "Grenada", "Guadeloupe", "Saint Vincent and The Grenadines",
    "St. Pierre and Miquelon", "Virgin Islands (BRITISH)", "Virgin Islands (U.S.)",
    # never got any spam from - RIPE:
    "Albania", "Andorra", "Gibraltar", "Greenland", "Guernsey", "Iceland",
    "Isle of Man", "Jersey", "Jordan", "Liechtenstein", "Monaco", "Montserrat",
    "San Marino", "Syria", "Yemen", "Vatican",
    "\xc5land Islands", "\xc3\x85land Islands", "Åland Islands", "Aland Islands",  # Åland Islands
    # never got any spam from - misc.
    "American Samoa", "Aruba", "Bahrain", "Barbados", "Belize",
    "British Indian Ocean Territory", "Brunei Darussalam", "Burkina Faso",
    "Cameroon", "Comoros", "Cook Islands", "Costa Rica", "Cote D'ivoire", "Cuba",
    "Dominica", "Egypt", "Faroe Islands", "Faroe Islands", "French Guiana",
    "French Polynesia", "Gabon", "Ghana", "Guam", "Guyana", "Honduras", "Jamaica",
    "Kenya", "Kiribati", "Korea Democratic People's Republic of",
    "Lao People's Democratic Republic", "Lebanon", "Lesotho",
    "Libyan Arab Jamahiriya", "Maldives", "Mali", "Malta", "Marshall Islands",
    "Mauritania", "Micronesia Federated States of", "Moldova Republic of",
    "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands Antilles",
    "New Caledonia", "Nicaragua", "Niue", "Non-spec Asia Pas Location",
    "Norfolk Island", "Northern Mariana Islands", "Palau", "Papua New Guinea",
    "Puerto Rico", "Qatar", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Martin", "Samoa", "Sao Tome and Principe", "Solomon Islands",
    "South Sudan", "Sudan", "Suriname", "Swaziland", "Tajikistan", "Timor-leste",
    "Togo", "Tokelau", "Tonga", "Trinidad and Tobago", "Turkmenistan",
    "Turks and Caicos Islands", "Tuvalu", "Uganda", "Uzbekistan", "Vanuatu",
    "Wallis and Futuna Islands",
]
countriesProbablySpam = [
    "France", "United Kingdom", "Czech Republic", "Germany", "Israel", "Japan",
    "Mexico", "Sweden",
]

# END OF USUALLY-SETTABLE PARAMETERS ---------------------------------------------
# advanced parameters:
maxLineLen = 1024

# Leading indentation of the generated YAML.  The rule lines use 6 spaces: only
# then is their static text 66 characters long, the figure the line-splitting
# code was written for.  The 4- and 2-space levels above them are the matching
# nesting; NOTE(review): confirm they agree with your filter-prolog.yaml.
filterActionWhitelist = "    actions: [{action: finish, dest: ''}]"
filterActionDiscard = "    actions: [{action: save, dest: /dev/null}, {action: finish, dest: ''}]"
filterActionSpamProbably = "    actions: [{{action: save, dest: " + folderForProbableSpam + "/.{0}}}, {{action: finish, dest: ''}}]"
filterActionSpamPrettySure = "    actions: [{{action: save, dest: " + folderForSureSpam + "/.{0}}}, {{action: finish, dest: ''}}]"

# static text written around every rule pattern
_RULE_PREFIX = "      - {opt: or, part: $message_headers, match: matches, val: \""
_RULE_SUFFIX = "\"}\n"

# latin-1 maps every byte to exactly one code point, so file contents pass
# through this script unchanged, byte for byte, whatever their real encoding is
# (this mirrors the byte-string handling of the original Python 2 version).
_FILE_ENCODING = "latin-1"

# official long names -> normal names
_COUNTRY_SHORT_NAMES = {
    "British Indian Ocean Territory": "Chagos",
    "Congo The Democratic Republic of The": "Congo DR",
    "Cote D'ivoire": "CoteDivoire",
    "Croatia (LOCAL Name: Hrvatska)": "Croatia",
    "Holy See (VATICAN City State)": "Vatican",
    "Iran (ISLAMIC Republic Of)": "Iran",
    "Korea Democratic People's Republic of": "Korea DPR",
    "Korea Republic of": "Korea",
    "Lao People's Democratic Republic": "Laos",
    "Libyan Arab Jamahiriya": "Libya",
    "Micronesia Federated States of": "Micronesia",
    "Moldova Republic of": "Moldova",
    "Non-spec Asia Pas Location": "AsiaPas",
    "Northern Mariana Islands": "Mariana",
    "Palestinian Territory Occupied": "Palestine",
    "Russian Federation": "Russia",
    "Saint Vincent and The Grenadines": "St Vincent",
    "Slovakia (SLOVAK Republic)": "Slovakia",
    "St. Pierre and Miquelon": "St Pierre",
    "Syrian Arab Republic": "Syria",
    "Taiwan; Republic of China (ROC)": "Taiwan",
    "Tanzania United Republic of": "Tanzania",
    "Turks and Caicos Islands": "Turks and Caicos",
    "United States": "US",
    "Virgin Islands (BRITISH)": "Virgin Islands BR",
    "Virgin Islands (U.S.)": "Virgin Islands US",
    "Wallis and Futuna Islands": "Wallis and Futuna",
    "\xc5land Islands": "Aland",      # name as read from a Latin-1 data file
    "\xc3\x85land Islands": "Aland",  # name as read from a UTF-8 data file (see _FILE_ENCODING)
}


def mapCountryName(longName):
    """Convert an official long country name to its normal (short) name.

    Names without an entry in the mapping table are returned unchanged.
    """
    return _COUNTRY_SHORT_NAMES.get(longName, longName)

# END OF PARAMETERS -----------------------------------------------------------


def loadIp2Country(mergeAdjacent=True, mergeEvenIfFirstOctetDifferent=True):
    """Read the ip2c data file into a dict: country name -> list of ranges.

    Each range is an (ipFrom, ipTo) tuple holding the numeric addresses as the
    strings found in the file.  When the file is missing or older than
    updateInterval days, download() is called first.

    mergeAdjacent -- join a record with the previous one when both belong to
        the same country and the record starts right after the previous end.
    mergeEvenIfFirstOctetDifferent -- when False, only join records whose
        touching addresses share the first octet.

    Lines starting with '#' and blank lines are skipped; every other line must
    consist of 7 comma-separated, optionally double-quoted fields, otherwise
    ValueError is raised.
    """
    now = time.time()
    if os.path.isfile(ip2c):
        ageSeconds = now - os.stat(ip2c)[stat.ST_MTIME]
        if ageSeconds > updateInterval * 86400:
            print("Data file is", ageSeconds / 86400, "days old, updating...")
            download()
    else:
        download()

    countryDict = {}
    prevCountry = ""
    with open(ip2c, encoding=_FILE_ENCODING) as f:
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line used to break the 7-field unpacking below
                continue
            words = [item.strip('"') for item in line.split(",")]
            ipFrom, ipTo, _registry, _dateAssigned, _country2, _country3, country = words
            if mergeAdjacent and country == prevCountry:
                prevFrom, prevTo = countryDict[country][-1]
                adjacent = int(ipFrom) == int(prevTo) + 1
                sameFirstOctet = (int(ipFrom) >> 24) == (int(prevTo) >> 24)
                if adjacent and (mergeEvenIfFirstOctetDifferent or sameFirstOctet):
                    countryDict[country][-1] = (prevFrom, ipTo)
                    continue
            countryDict.setdefault(country, []).append((ipFrom, ipTo))
            prevCountry = country
    return countryDict


def download():
    """Fetch a fresh IpToCountry.csv with curl and gunzip.

    The shell command downloads the gzipped file, renames an existing
    IpToCountry.csv to "old" and unpacks the new one.  The exit status of the
    command is not checked.
    """
    os.system("curl software77.net/geo-ip/?DL=1 > IpToCountry.csv.gz && if [ -e IpToCountry.csv ]; then mv IpToCountry.csv old; fi && gunzip IpToCountry.csv.gz")


def num2octets(num):
    """Split a numeric IPv4 address into the tuple (o1, o2, o3, o4)."""
    return (num >> 24, (num >> 16) & 0xff, (num >> 8) & 0xff, num & 0xff)


def octets2str(octets):
    """Format a 4-tuple of octets as a dotted address, e.g. "10.0.0.1"."""
    o1, o2, o3, o4 = octets
    return "{0}.{1}.{2}.{3}".format(o1, o2, o3, o4)


def singleDigitRangeRe(d1, d2):
    """Return the shortest RE matching one digit in the range d1..d2.

    0..9 gives "[0-9]", two neighbouring digits give e.g. "[34]", wider ranges
    give e.g. "[3-7]"; otherwise (d2 not above d1) the digit d1 itself.
    """
    dif = d2 - d1
    if dif == 9:
        return "[0-9]"  # maybe one day we can use \d
    if dif == 1:
        # 2 consecutive digits: use shorter [34] instead of [3-4]
        return "[" + str(d1) + str(d2) + "]"
    if dif > 1:
        return "[" + str(d1) + "-" + str(d2) + "]"
    return str(d1)


def octetPair2rangeRE(o1, o2):
    """Return an RE matching every decimal number from o1 to o2 (one octet).

    Raises IndexError when o1 is greater than o2.  The "middle hundreds" part
    is hard-coded as 1[0-9][0-9], which is all that values up to 255 need.
    """
    if o1 > o2:
        raise IndexError("First octet should be smaller:", o1, o2)
    stro1 = str(o1)
    stro2 = str(o2)

    if o1 == o2:  # same
        return stro1

    if stro1[:-1] == stro2[:-1]:  # only last digit varies
        return stro1[:-1] + singleDigitRangeRe(o1 % 10, o2 % 10)

    if stro1[:-2] == stro2[:-2]:  # last 2 digits vary
        d1 = o1 % 10
        d2 = o2 % 10
        tens1 = (o1 % 100) // 10  # not just 2nd-to-last-char, we want 0, if it is missing
        tens2 = (o2 % 100) // 10
        maybe = ''
        parts = []
        # first part - remainder of current 10, e.g. for 43-81, it is 43-50
        if d1 == 0 and tens2 > 1:
            # if possible, merge [0-9] to the middle part
            if tens1 > 0 or o1 >= 100:
                # if we are processing a remainder of an octet (e.g. 00 of 200), we have to preserve 0
                tens1 -= 1
            else:
                maybe = '?'
        else:
            parts.append(stro1[-2:-1] + singleDigitRangeRe(d1, 9))
        if d2 == 9:
            # the last ten is complete, so the middle part can cover it as well
            tens2 += 1
        # middle part, e.g. for 43-81, it is 50-79
        if tens2 - tens1 > 1:
            parts.append(singleDigitRangeRe(tens1 + 1, tens2 - 1) + maybe + "[0-9]")
        # rest, partial last 10, e.g. for 43-81, it is 80-81
        if d2 != 9:
            parts.append(str(tens2) + singleDigitRangeRe(0, d2))
        if len(parts) > 1:
            return stro1[:-2] + '(' + '|'.join(parts) + ')'
        return stro1[:-2] + parts[0]

    # last 3 digits vary, e.g. 123-234, 12-234, 1-234
    hundreds1 = o1 // 100
    hundreds2 = o2 // 100
    pattern = stro1[:-3] + "("
    # rest of first 100
    pattern += octetPair2rangeRE(o1, hundreds1 * 100 + 99)
    pattern += "|"
    # middle 100s
    if hundreds2 - hundreds1 > 1:
        # this will always be "1" for IP addresses
        pattern += "1[0-9][0-9]"
        pattern += "|"
    # partial last 100, e.g. 200-255
    pattern += octetPair2rangeRE(hundreds2 * 100, o2)
    pattern += ")"
    return pattern


def makeEndRe(octets1, octets2):
    """Return an RE matching the addresses from octets1 to octets2.

    octets1 / octets2 are sequences of the remaining (trailing) octets of the
    first and last address; they may differ in length.  Octets are joined with
    "[.]"; a tail that covers the full 0-255 range of every remaining octet is
    left out, so such patterns end right after "[.]".

    Raises NotImplementedError when the first remaining octets are 0 and 255
    but the ones after them are not all 0 / 255.  On IndexError or TypeError
    the arguments are printed and the exception is re-raised.

    NOTE(review): only the octet directly after the first one is checked for
    0 / 255 before a boundary octet is folded into the "middle" part, so a
    range such as 194.245.0.5-... yields a pattern that also matches
    194.245.0.0 - 194.245.0.4.  Left as is; confirm whether this matters.
    """
    FIRST_IP = (0, 0, 0, 0)
    LAST_IP = (255, 255, 255, 255)
    try:
        if octets1[0] == 0 and octets2[0] == 255:
            for o in range(1, min(len(octets1), len(octets2))):
                if octets1[o] != 0 or octets2[o] != 255:
                    raise NotImplementedError("Octets are 0-255, but the rest are not")
            return ""

        if len(octets1) == 1 or len(octets2) == 1:
            return octetPair2rangeRE(octets1[0], octets2[0])

        if octets1[0] == octets2[0]:
            return str(octets1[0]) + "[.]" + makeEndRe(octets1[1:], octets2[1:])

        # 194.245.1.5 - 200.248.39.95 = (194.([0-244].|245.(1.[5-255]|[2-255].)) | [195-199]. | 200.([0-247].|248.([0-38].|39[0-95])))
        # 1. 194.245.1.5 - 194.255.255.255 =
        #    = 194.245.1.5 - 194.245.1.255 | 194.245.2.0 - 194.245.255.255 | 194.246.0.0 - 194.255.255.255 =
        #    = 194.245.1.[5-255] | 194.245.[2-255]. | 194.[246-255]. =
        #    = 194.(245.(1.[5-255] | [2-255].) | [246-255].)
        # 2. [195-199].
        # 3. 200.0.0.0 - 200.248.39.95
        #    = 200.0.0.0 - 200.247.255.255 | 200.248.0.0 - 200.248.38.255 | 200.248.39.0 - 200.248.39.95
        #    = 200.[0-247]. | 200.248.[0-38]. | 200.248.39.[0-95]
        #    = 200.([0-247].|248.([0-38].|39[0-95]))
        parts = []
        # 1. first part - remainder of current octet
        if octets1[1] == 0:
            middleo1 = octets1[0]  # include in the middle part
        else:
            parts.append(str(octets1[0]) + "[.]" + makeEndRe(octets1[1:], LAST_IP))
            middleo1 = octets1[0] + 1
        endIsFull = octets2[1] == 255
        if endIsFull:
            middleo2 = octets2[0]  # include in the middle part
        else:
            middleo2 = octets2[0] - 1  # the last octet gets its own part below
        # 2. middle - full octets between 1 and 2
        if middleo2 >= middleo1:
            parts.append(octetPair2rangeRE(middleo1, middleo2) + "[.]")
        # 3. end - partial octet
        if not endIsFull:
            parts.append(str(octets2[0]) + "[.]" + makeEndRe(FIRST_IP, octets2[1:]))

        if len(parts) > 1:
            return "(" + "|".join(parts) + ")"
        if len(parts) == 1:
            return parts[0]
        return octetPair2rangeRE(middleo1, middleo2) + "[.]"
    except (IndexError, TypeError):
        print(octets1, octets2)
        raise


def makeRe(octets1, octets2):
    """Return the RE for one address range, starting with a literal "[".

    NOTE(review): nothing is appended after the last octet, so a pattern that
    spells out the last octet is not anchored at the closing "]".
    """
    return "[[]" + makeEndRe(octets1, octets2)


def createMergedRE(similar1, similar2):
    """Create merged REs from parallel lists of first/last address octets.

    similar1[i] / similar2[i] are the 4 octets of the first / last address of
    range i.  When the first range spans more than one first octet, only that
    range is converted.  Otherwise all ranges are expected to share the first
    octet; ranges that also share the second octet are grouped, and the
    alternatives are joined with '|' and broken into several REs so that each
    generated filter line stays within maxLineLen characters.

    Returns a list of RE strings.
    """
    if similar1[0][0] != similar2[0][0]:
        return ["[[]" + makeEndRe(similar1[0], similar2[0])]

    first = None
    res234 = []  # REs for octets 2-4, one per range or group of ranges
    numPairs = len(similar1)
    for i in range(numPairs):
        start = similar1[i]
        end = similar2[i]
        if start[1] != end[1]:
            # the range spans several second octets: it cannot join a group
            res234.append(makeEndRe(start[1:], end[1:]))
            first = None
            continue
        if i < numPairs - 1 and start[1] == similar1[i + 1][1] == similar2[i + 1][1]:
            # the next range has the same second octet: keep collecting
            if first is None:
                first = i
            continue
        if first is None:
            first = i
        # REs of the rest (octets 3 and 4) of every range in the group
        res34 = [makeEndRe(similar1[k][2:], similar2[k][2:]) for k in range(first, i + 1)]
        re234 = str(start[1])  # RE of octet 2 and the rest
        if len(res34) == 1:
            re234 += "[.]" + res34[0]
        elif len(res34) > 1:
            re234 += "[.](" + "|".join(res34) + ")"
        res234.append(re234)
        first = None

    # join REs with '|', breaking it into pieces that will fit on a line (max=1024 chars)
    baseRe = "[[]" + str(similar1[0][0]) + "[.]"
    if len(res234) == 1:
        return [baseRe + res234[0]]

    # maybe we can refactor out ending "[.]" to the end of combined RE
    allHaveEndDot = all(alt.endswith("[.]") for alt in res234)
    tail = ""
    if allHaveEndDot:
        tail = "[.]"
        res234 = [alt[:-len(tail)] for alt in res234]

    # max line len - len of the static text written around each pattern
    # (66 characters plus the newline)
    budget = maxLineLen - len(_RULE_PREFIX) - len(_RULE_SUFFIX)
    baseLen = len(baseRe) + 2  # ()
    relen = baseLen
    first = 0
    res = []
    for i, alt in enumerate(res234):
        relen += len(alt) + 1  # '|'
        # "i > first": never emit an empty "()" when a single alternative
        # is already longer than the budget
        if relen > budget and i > first:
            res.append(baseRe + "(" + "|".join(res234[first:i]) + ")" + tail)
            relen = baseLen + len(alt)
            first = i
    res.append(baseRe + "(" + "|".join(res234[first:]) + ")" + tail)
    return res


def _mergedREsForRanges(aIPs):
    """Return the merged RE strings for a list of (ipFrom, ipTo) records.

    Consecutive records that stay within one first octet, and share it, are
    handed to createMergedRE() together; every other record is converted alone.
    """
    patterns = []
    similar1 = []
    similar2 = []
    for ip1, ip2 in aIPs:
        octets1 = num2octets(int(ip1))
        octets2 = num2octets(int(ip2))
        if octets1[0] == octets2[0] and (not similar1 or octets1[0] == similar1[-1][0]):
            similar1.append(octets1)
            similar2.append(octets2)
            continue
        if similar1:
            patterns.extend(createMergedRE(similar1, similar2))
            # current octets were not added to the group and not processed yet
            similar1 = [octets1]
            similar2 = [octets2]
        else:
            patterns.extend(createMergedRE((octets1,), (octets2,)))
    if similar1:
        patterns.extend(createMergedRE(similar1, similar2))
    return patterns


def createREs(country, countryDict=None, compileREs=True):
    """Return the merged REs for all address ranges of one country.

    countryDict -- result of loadIp2Country(); loaded when None.
    compileREs -- when True the list holds (pattern, compiled RE) tuples,
        otherwise just the pattern strings.
    """
    if countryDict is None:
        countryDict = loadIp2Country()
    patterns = _mergedREsForRanges(countryDict[country])
    if compileREs:
        return [(m, re.compile(m)) for m in patterns]
    return patterns


def makeFilterFromIpToCountryCSV(oneRecordPerLine=False):
    """Write filterFileName with one filter entry per non-excluded country.

    The file starts with the contents of filterProlog (or a minimal header when
    that file does not exist) and ends with the contents of filterEpilog when
    it exists.  Countries listed in countriesToExclude, by long or short name,
    are skipped; mail from countries in countriesProbablySpam is saved under
    folderForProbableSpam, all the rest under folderForSureSpam.

    oneRecordPerLine = don't merge records starting with the same octet
    """
    countryDict = loadIp2Country()
    with open(filterFileName, "w", encoding=_FILE_ENCODING) as f:
        if os.path.exists(filterProlog):
            with open(filterProlog, "r", encoding=_FILE_ENCODING) as fp:
                f.write(fp.read())
        else:
            f.write("--- \nfilter: \n")

        for cName in sorted(countryDict):
            shortName = mapCountryName(cName)
            if cName in countriesToExclude or shortName in countriesToExclude:
                continue

            f.write("  - \n    filtername: ")
            if ':' in cName:
                # a colon would be read as a YAML key separator: quote the name
                f.write("\"" + cName + "\"\n")
            else:
                f.write(cName + "\n")

            # countriesProbablySpam is optional: users may delete it from the settings
            probablySpam = 'countriesProbablySpam' in globals() and (
                cName in countriesProbablySpam or shortName in countriesProbablySpam)
            if probablySpam:
                f.write(filterActionSpamProbably.format(shortName) + "\n")
            else:
                f.write(filterActionSpamPrettySure.format(shortName) + "\n")

            f.write("    rules: " + "\n")
            aIPs = countryDict[cName]
            if oneRecordPerLine:
                patterns = [makeRe(num2octets(int(ip1)), num2octets(int(ip2)))
                            for ip1, ip2 in aIPs]
            else:
                patterns = _mergedREsForRanges(aIPs)
            for m in patterns:
                f.write(_RULE_PREFIX + m + _RULE_SUFFIX)

        if os.path.exists(filterEpilog):
            with open(filterEpilog, "r", encoding=_FILE_ENCODING) as fp:
                f.write(fp.read())


####################################
# testing routines
####################################

def addCommas(num):
    """Return num as a string with a comma between every group of 3 digits."""
    digits = str(num)
    sign = ""
    if digits.startswith("-"):
        # keep the sign out of the grouping ("-,123" otherwise)
        sign = "-"
        digits = digits[1:]
    groups = []
    while len(digits) > 3:
        groups.insert(0, digits[-3:])
        digits = digits[:-3]
    groups.insert(0, digits)
    return sign + ",".join(groups)


def numIPsInCountry(country, countryDict=None):
    """Return the number of addresses in all ranges of a country."""
    if countryDict is None:
        countryDict = loadIp2Country(False)  # don't merge records, we want full count
    return sum(int(ip2) - int(ip1) + 1 for ip1, ip2 in countryDict[country])


def dumpCountriesIPsAndREs(onlyThisCountry=None):
    """Print every range of every country together with its RE.

    onlyThisCountry -- long or short name of the single country to print.
    """
    countryDict = loadIp2Country()
    for cName in sorted(countryDict):
        if onlyThisCountry and onlyThisCountry not in (cName, mapCountryName(cName)):
            continue
        print(cName)
        for ip1, ip2 in countryDict[cName]:
            octets1 = num2octets(int(ip1))
            octets2 = num2octets(int(ip2))
            print("\t" + str(ip1), "-", ip2, "=", octets2str(octets1), "-",
                  octets2str(octets2), "=", makeRe(octets1, octets2))


def dumpCountriesIPsAndMergedREs(onlyThisCountry=None):
    """Print, per country, its address ranges and its merged REs.

    onlyThisCountry -- long or short name of the single country to print.
    """
    countryDict = loadIp2Country()
    car = {}  # country, addresses, REs
    for cName in sorted(countryDict):
        if onlyThisCountry and onlyThisCountry not in (cName, mapCountryName(cName)):
            continue
        car[cName] = (countryDict[cName], createREs(cName, countryDict, False))
    for cName in car:
        # print this country's own data (not whatever the loop above saw last)
        aIPs, res = car[cName]
        print(cName, aIPs, res)


def dumpCounts(sortBy="name"):
    """Print the number of records and addresses per country.

    sortBy -- "name", "numRecords", "numIPs", or anything else ("showSplit")
        to show on which countries to split to divide work by 8.
    """
    countryDict = loadIp2Country(False)  # don't merge records, we want full count
    print()
    print(len(countryDict), "countries", end=" ")
    total = 0
    numRecords = {}
    numIPs = {}
    for cName, aIPs in countryDict.items():
        numRecords[cName] = len(aIPs)
        total += len(aIPs)
        numIPs[cName] = sum(int(ip2) - int(ip1) + 1 for ip1, ip2 in aIPs)
    print("-", total, "records")
    print()

    if sortBy == "name":
        print("%-37s\t%10s\t%13s" % ("name", "records", "IPs"))
        print("%-37s\t%10s\t%13s" % ("----", "-------", "---"))
        for cName in sorted(countryDict):
            print("%-37s\t%10d\t%13s" % (cName, numRecords[cName], addCommas(numIPs[cName])))
    elif sortBy == "numRecords":
        for c in reversed(sorted(numRecords.items(), key=lambda t: t[1])):
            print("%-24s\t%s" % (mapCountryName(c[0]), c[1]))
    elif sortBy == "numIPs":
        for c in reversed(sorted(numIPs.items(), key=lambda t: t[1])):
            print("%-24s\t%s" % (mapCountryName(c[0]), addCommas(c[1])))
    else:  # sortBy == "showSplit": show on which countries to split to divide work by 8
        parts = 1
        ipsSoFar = 0
        for cName in sorted(countryDict):
            ipsSoFar += numIPs[cName]
            if ipsSoFar > (0xffffffff // 8) * parts:
                print(cName, addCommas(ipsSoFar))
                parts += 1


def test_single_octets():
    """Print the RE of every octet range o1-o2 with 0 <= o1 <= o2 <= 255."""
    for o1 in range(256):
        for o2 in range(o1, 256):
            print(o1, "-", o2, ":", octetPair2rangeRE(o1, o2))


def testMakeEndRe(ip1=None, ip2=None, reStr=""):
    """Compare makeEndRe() results with the expected REs; print the differences.

    Without arguments the built-in test sets are used; otherwise ip1 and ip2
    are dotted addresses and reStr is the RE expected for that range.
    """
    testSets = (
        ((186, 1, 192, 0), (186, 1, 207, 255), "186[.]1[.](19[2-9]|20[0-7])[.]"),
        ((173, 45, 106, 64), (173, 45, 106, 79), "173[.]45[.]106[.](6[4-9]|7[0-9])"),  # US
        ((82, 139, 192, 0), (82, 140, 63, 255), "82[.](139[.](19[2-9]|2([0-4][0-9]|5[0-5]))[.]|140[.]([1-5]?[0-9]|6[0-3])[.])"),
        ((59, 191, 240, 0), (60, 31, 255, 255), "(59[.](191[.]2(4[0-9]|5[0-5])[.]|(19[2-9]|2([0-4][0-9]|5[0-5]))[.])|60[.]([12]?[0-9]|3[01])[.])"),  # China
        ((118, 244, 0, 0), (119, 2, 31, 255), "(118[.]2(4[4-9]|5[0-5])[.]|119[.]([01][.]|2[.]([12]?[0-9]|3[01])[.]))"),  # China
        ((194, 245, 1, 5), (194, 246, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|246[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"),  # Germany
        ((194, 245, 1, 5), (194, 247, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|246[.]|247[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"),  # Germany
        ((194, 245, 1, 5), (194, 248, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|24[67][.]|248[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"),  # Germany
        ((41, 200, 0, 0), (41, 201, 255, 255), "41[.]20[01][.]"),
        ((79, 106, 0, 0), (79, 106, 255, 255), "79[.]106[.]"),  # Albania
        ((79, 171, 48, 0), (79, 171, 65, 255), "79[.]171[.](4[89]|5[0-9]|6[0-5])[.]"),  # Albania
        ((79, 171, 48, 0), (79, 171, 75, 255), "79[.]171[.](4[89]|[56][0-9]|7[0-5])[.]"),  # Albania
        ((79, 171, 48, 0), (79, 171, 85, 255), "79[.]171[.](4[89]|[5-7][0-9]|8[0-5])[.]"),  # Albania
        ((200, 41, 0, 0), (200, 42, 159, 255), "200[.](41[.]|42[.]([1-9]?[0-9]|1[0-5][0-9])[.])"),  # Argentina
        ((200, 41, 223, 0), (200, 42, 159, 255), "200[.](41[.]2(2[3-9]|[34][0-9]|5[0-5])[.]|42[.]([1-9]?[0-9]|1[0-5][0-9])[.])"),  # above, modified
        ((200, 50, 0, 0), (200, 51, 255, 255), "200[.]5[01][.]"),  # Argentina
        ((200, 69, 192, 0), (200, 70, 255, 255), "200[.](69[.](19[2-9]|2([0-4][0-9]|5[0-5]))[.]|70[.])"),  # Argentina
    )
    if ip1 and ip2:
        octets1 = [int(a) for a in ip1.split(".")]
        octets2 = [int(a) for a in ip2.split(".")]
        testSets = [(octets1, octets2, reStr)]
    for s1, s2, myRe in testSets:
        madeRe = makeEndRe(s1, s2)
        if madeRe != myRe:
            print(s1, s2)
            print(madeRe, "<= is")
            print(myRe, "<= should be")
            print("---------------")


def mapUserCountyName(userName, cName):
    """Allow user to use short names, e.g. "US", instead of "United States".

    Returns cName when userName is the short name of cName, else userName.
    """
    if userName == mapCountryName(cName):
        return cName
    return userName


def testIPsAgainstREs(mergedRecords=True, startCountry="A", endCountry="\xff"):
    """Check every IP for a match against corresponding RegEx.

    Takes almost 4 hours on 2.8 GHz MacPro.  Only the countries whose names
    sort between startCountry and endCountry (long or short names) are tested;
    the first address of a range that does not match is reported.
    """
    countryDict = loadIp2Country(mergedRecords)
    ipsTested = 0
    for cName in sorted(countryDict):
        startCountry = mapUserCountyName(startCountry, cName)
        endCountry = mapUserCountyName(endCountry, cName)
        if cName < startCountry:
            continue
        if cName > endCountry:
            break
        if startCountry != endCountry:  # show progress
            print(cName, end=" ")
            sys.stdout.flush()

        # for each record, compare each IP in the range, to a corresponding RE,
        # and report if it does not match
        aIPs = countryDict[cName]
        print("-", len(aIPs), "records", addCommas(numIPsInCountry(cName, countryDict)), "IPs ", end=" ")
        for iptuple in aIPs:
            ip1, ip2 = [int(a) for a in iptuple]
            rangeRe = makeRe(num2octets(ip1), num2octets(ip2))
            regEx = re.compile(rangeRe)
            ip = ip1
            while ip <= ip2:
                ipsTested += 1
                if (ipsTested % 1000000) == 0:
                    sys.stdout.write("%s" % ".")
                    sys.stdout.flush()
                ipStr = "[" + octets2str(num2octets(ip)) + "]"
                if not regEx.match(ipStr):
                    print(" *** No match for", cName, ipStr[1:-1], "-", rangeRe)
                    break
                ip += 1
        print()


def testIPsAgainstMergedREs(startCountry="A", endCountry="\xff", startPercent=0, endPercent=100):
    """Check every IP for a match against all RegEx-es of its country.

    Since the REs are merged we don't know which one it should be.  Only the
    countries whose names sort between startCountry and endCountry are tested,
    and within each country only the addresses between startPercent and
    endPercent of its address count; runs of non-matching addresses are
    reported as "first - last".
    """
    countryDict = loadIp2Country()
    startPercent = int(startPercent)
    endPercent = int(endPercent)
    ipsTested = 0
    for cName in sorted(countryDict):
        startCountry = mapUserCountyName(startCountry, cName)
        endCountry = mapUserCountyName(endCountry, cName)
        if cName < startCountry:
            continue
        if cName > endCountry:
            break
        # show progress
        if startCountry != endCountry:
            print(cName, end=" ")
            sys.stdout.flush()

        # create (merged) REs for all records in this country
        res = createREs(cName, countryDict)

        # for each country, compare each IP belonging to it, to all its REs,
        # and report if it does not match any
        numIPsInThisCountry = numIPsInCountry(cName, countryDict)
        print("has", addCommas(numIPsInThisCountry), "IPs", end=" ")
        trackProgress = startPercent > 0 or numIPsInThisCountry > 1000000
        ipsDoneThisCountry = 0
        percentDoneThisCountry = 0
        percentReportedThisCountry = 0
        needCR = True  # the current output line has text on it
        for iptuple in countryDict[cName]:
            ip1, ip2 = [int(a) for a in iptuple]
            firstIpReported = False  # inside a reported run of non-matching IPs
            lastIp = ip2
            # a while loop, because a range() of this size would need 16+ GB of memory
            ip = ip1
            while ip <= ip2:
                if trackProgress:
                    ipsDoneThisCountry += 1
                    percentDoneThisCountry = float(ipsDoneThisCountry) / numIPsInThisCountry * 100
                    if percentDoneThisCountry < startPercent:
                        if (ipsDoneThisCountry % 1000000) == 0:
                            sys.stdout.write("%s" % ",")
                            sys.stdout.flush()
                        # skip this address; without the increment the loop
                        # would only advance the counter, never the address
                        ip += 1
                        continue
                    if percentDoneThisCountry > (endPercent + 0.1):
                        break
                    if (ipsDoneThisCountry % 1000000) == 0:
                        sys.stdout.write("%s" % ".")
                        sys.stdout.flush()
                    if (numIPsInThisCountry > 50 * 1000 * 1000
                            and percentDoneThisCountry > percentReportedThisCountry + 1
                            and not firstIpReported):  # skip it if we are in-between start and end range
                        print("{0:.0f}%".format(percentDoneThisCountry), end=" ")
                        sys.stdout.flush()
                        percentReportedThisCountry = percentDoneThisCountry
                        needCR = True

                ipsTested += 1
                ipStr = "[" + octets2str(num2octets(ip)) + "]"
                matched = any(regEx.match(ipStr) for _reStr, regEx in res)
                if not matched:
                    if not firstIpReported:
                        if needCR:
                            print()
                            needCR = False
                        print("*** No match for", cName, ipStr[1:-1], "-", end=" ")
                        firstIpReported = True
                    elif ip == lastIp:
                        print(octets2str(num2octets(ip)), "(full range)")
                        firstIpReported = False
                elif firstIpReported:
                    # the run of non-matching addresses ended at the previous IP
                    print(octets2str(num2octets(ip - 1)))
                    firstIpReported = False
                ip += 1

        print(" - so far tested", addCommas(ipsTested), "IPs", end=" ")
        if startCountry == "A":
            print("=", "{0:.1%}".format(float(ipsTested) / 0xffffffff), "of total")
        else:
            print()


####################################
# main
####################################

def _cliBool(text):
    """Interpret a command-line word as a boolean ("0", "false", "no", "off" and "" are False)."""
    return text.strip().lower() not in ("", "0", "false", "no", "off")


def main(argv=None):
    """Run the action selected by the first command-line argument.

    -dcir [country]          dump ranges and REs
    -dc / -dcr / -dci / -dcs dump counts (by name / records / IPs / work split)
    -op o1 o2                print the RE of one octet range
    -tso                     print the REs of all octet ranges
    -tmr [ip1 ip2 re]        test makeEndRe()
    -tre [merged [start [end]]]            test every IP against its RE
    -tmre [start [end [start% [end%]]]]    test every IP against merged REs
    anything else            create the filter file
    """
    argv = sys.argv if argv is None else argv
    selector = argv[1] if len(argv) > 1 else ""
    extra = list(argv[2:])

    if selector == "-dcir":
        dumpCountriesIPsAndREs(*extra[:1])
    elif selector == "-dc":
        dumpCounts()
    elif selector == "-dcr":
        dumpCounts("numRecords")  # sorted by number of records
    elif selector == "-dci":
        dumpCounts("numIPs")  # sorted by number of IPs
    elif selector == "-dcs":
        dumpCounts("showSplit")  # compute how to split countries to 8 jobs
    elif selector == "-op":
        print(octetPair2rangeRE(int(argv[2]), int(argv[3])))
    elif selector == "-tso":
        test_single_octets()
    elif selector == "-tmr":
        if len(extra) < 3:
            testMakeEndRe()
        else:
            testMakeEndRe(extra[0], extra[1], extra[2])
    elif selector == "-tre":
        args = extra[:3]
        if args:
            # the first argument is the mergedRecords flag, not a country
            args[0] = _cliBool(args[0])
        testIPsAgainstREs(*args)
    elif selector == "-tmre":
        testIPsAgainstMergedREs(*extra[:4])
    else:
        makeFilterFromIpToCountryCSV()


if __name__ == "__main__":
    main()