#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
#
# create filter.yaml email filter from IpToCountry.csv file (from software77.net/geo-ip)
# To Do:
#	- maybe one day we can use \d instead of [0-9] - right now yaml requires \\\\d, which does not provide any savings in filter size
#	- maybe one day we can use \. instead of [.] - right now yaml requires \\\\., which results in negative savings
# ----------------------------------------------------------

# These, you _should_ change - paths are relative to ~/mail

# Destination folders inserted into the "save" action templates
# (filterActionSpamProbably / filterActionSpamPrettySure) defined further down.
folderForProbableSpam = "jacek-dom.net/spam-probably"
folderForSureSpam = "jacek-dom.net/spam-prettysure"
# ----------------------------------------------------------

# These you _may_ change

ip2c = "IpToCountry.csv"	# input CSV with the IP ranges; read by loadIp2Country()
updateInterval = 7	# days; only referenced by the refresh check that is currently commented out in loadIp2Country()
filterFileName = "filter.yaml"	# output file written by makeFilterFromIpToCountryCSV()
filterProlog = "filter-prolog.yaml"	# optional; copied verbatim to the start of the output when the file exists
filterEpilog = "filter-epilog.yaml"	# optional; copied verbatim to the end of the output when the file exists
# Countries that get a filter entry; every other country is skipped.
# Either the CSV's long name or the short name from mapCountryName() may be used.
countriesToInclude = ["United States", "Canada", "Poland"]
# Countries whose entry saves the mail to folderForProbableSpam instead of whitelisting it.
# NOTE(review): a country is only processed at all when it is also listed in
# countriesToInclude; with the values shown here the two lists do not overlap - confirm intent.
countriesProbablySpam = ["Czech Republic", "France", "Germany", "Ireland", "Israel", "Japan", "Mexico", "Sweden", "United Kingdom"]

# END OF USUALLY-SETTABLE PARAMETERS ---------------------------------------------

# advanced parameters:
# Longest rule line to generate; createMergedRE() splits an alternation into
# several rules so that prefix + pattern stays below this length.
maxLineLen = 1024
# Ready-made "actions:" lines of a filter entry. The two SpamXxx templates go through
# str.format(), so "{0}" is replaced by the short country name and the doubled braces
# "{{" / "}}" come out as literal braces.
# NOTE(review): filterActionDiscard and filterActionSpamPrettySure are not referenced by
# the code visible in this file.
filterActionWhitelist =      "    actions: [{action: finish, dest: ''}]"
filterActionDiscard =        "    actions: [{action: save, dest: /dev/null}, {action: finish, dest: ''}]"
filterActionSpamProbably =   "    actions: [{{action: save, dest: " + folderForProbableSpam + "/.{0}}}, {{action: finish, dest: ''}}]"
filterActionSpamPrettySure = "    actions: [{{action: save, dest: " + folderForSureSpam + "/.{0}}}, {{action: finish, dest: ''}}]"
#filterRulePrefix = "      - {opt: or, part: $message_headers, match: matches, val: \""
# Start of every generated rule line. The regex part "^[^[]*" consumes everything up to
# the first "[" so the generated pattern (which itself starts with "[[]") is tried
# against the first bracketed address only.
filterRulePrefix = "      - {opt: or, part: $message_headers, match: matches, val: \"^[^[]*" # match only first IP (first [)
filterRulePrefixLength = len(filterRulePrefix)	# subtracted from maxLineLen when splitting patterns

def mapCountryName(longName): # convert official long names to normal names
	"""Return the short display name for a country name taken from the CSV.

	Names without an entry in the table are returned unchanged.
	"""
	shortNames = {
		"Bonaire; Sint Eustatius; Saba": "Bonaire",
		"British Indian Ocean Territory": "Chagos",
		"Congo The Democratic Republic of The": "Congo DR",
		"Cote D'ivoire": "CoteDivoire",
		"Croatia (LOCAL Name: Hrvatska)": "Croatia",
		"Holy See (VATICAN City State)": "Vatican",
		"Iran (ISLAMIC Republic Of)": "Iran",
		"Korea Democratic People's Republic of": "Korea DPR",
		"Korea Republic of": "Korea",
		"Lao People's Democratic Republic": "Laos",
		"Libyan Arab Jamahiriya": "Libya",
		"Micronesia Federated States of": "Micronesia",
		"Moldova Republic of": "Moldova",
		"Non-spec Asia Pas Location": "AsiaPas",
		"Northern Mariana Islands": "Mariana",
		"Palestinian Territory Occupied": "Palestine",
		"Russian Federation": "Russia",
		"Saint Vincent and The Grenadines": "St Vincent",
		"Slovakia (SLOVAK Republic)": "Slovakia",
		"St. Pierre and Miquelon": "St Pierre",
		"Syrian Arab Republic": "Syria",
		"Taiwan; Republic of China (ROC)": "Taiwan",
		"Tanzania United Republic of": "Tanzania",
		"Turks and Caicos Islands": "Turks and Caicos",
		"United States": "US",
		"Virgin Islands (BRITISH)": "Virgin Islands BR",
		"Virgin Islands (U.S.)": "Virgin Islands US",
		"Wallis and Futuna Islands": "Wallis and Futuna",
		# both spellings of the same name are kept, escaped and literal
		"\xc5land Islands": "Aland",
		"Åland Islands": "Aland",
	}
	return shortNames.get(longName, longName)

# END OF PARAMETERS -----------------------------------------------------------

import sys, os, stat, re, time  # sys.argv, os.path, os.system, stat.ST_MTIME, re.compile, re.match, time.time

def loadIp2Country(mergeAdjacent = True, mergeEvenIfFirstOctetDifferent = True, fileName = None):
	"""Read the IP-to-country CSV and group its ranges by country name.

	Lines starting with '#' and blank lines are ignored. Every other line must
	hold exactly seven comma-separated (optionally quoted) fields:
	ipFrom, ipTo, registry, dateAssigned, country2, country3, country.

	Parameters:
		mergeAdjacent - join a record with the previous one when both belong to the
			same country and the new range starts right after the previous range ends
		mergeEvenIfFirstOctetDifferent - when False, only join ranges whose
			boundary does not cross into another first octet
		fileName - CSV file to read; defaults to the module-level ip2c

	Returns a dict mapping country name to a list of (ipFrom, ipTo) tuples. Both
	values are kept as the decimal strings found in the file, in file order.

	Raises ValueError for a data line that does not have seven fields.
	"""
	import csv	# only needed here; handles the quoting instead of a bare split(",")

	if fileName is None:
		fileName = ip2c
# 	now = time.time()
# 	if not os.path.isfile(ip2c) or now - os.stat(ip2c)[stat.ST_MTIME] > updateInterval * 86400:
# 		if os.path.isfile(ip2c):
# 			print("Data file is", (now - os.stat(ip2c)[stat.ST_MTIME]) // 86400, "days old, updating...")
# 			sys.stdout.flush()
# 		download()

	countryDict = {}
	prevCountry = ""
	with open(fileName) as f:
		stripped = (rawLine.strip() for rawLine in f if not rawLine.startswith('#'))
		dataLines = (line for line in stripped if line)	# a blank line used to crash the unpacking
		for words in csv.reader(dataLines):
			if len(words) != 7:
				raise ValueError("{0}: expected 7 fields, got {1}: {2!r}".format(fileName, len(words), words))
			ipFrom, ipTo, country = words[0], words[1], words[6]
			if mergeAdjacent and country == prevCountry:
				prevFrom, prevTo = countryDict[country][-1]
				startsRightAfter = int(ipFrom) == int(prevTo) + 1
				sameFirstOctet = (int(ipFrom) >> 24) == (int(prevTo) >> 24)
				if startsRightAfter and (mergeEvenIfFirstOctetDifferent or sameFirstOctet):
					# extend the previous range instead of adding a new record
					countryDict[country][-1] = (prevFrom, ipTo)
					continue
			countryDict.setdefault(country, []).append((ipFrom, ipTo))
			prevCountry = country
	return countryDict

def download():
	"""Fetch a fresh IpToCountry.csv.gz with curl and unpack it in the current directory.

	An existing IpToCountry.csv is first moved to "old". The steps are chained
	with "&&" in one shell command, so a failing step stops the rest.
	"""
	steps = (
		"curl software77.net/geo-ip/?DL=1 > IpToCountry.csv.gz",
		"if [ -e IpToCountry.csv ]; then mv IpToCountry.csv old; fi",
		"gunzip IpToCountry.csv.gz",
	)
	os.system(" && ".join(steps))
	
def num2octets(num):
	"""Split a 32-bit address given as an integer into its four octets.

	Returns the tuple (o1, o2, o3, o4), most significant octet first. The first
	element is not masked, so it carries any bits above the low 24.
	"""
	return (num >> 24, (num >> 16) & 0xff, (num >> 8) & 0xff, num & 0xff)

def octets2str(octets):
	"""Format a sequence of exactly four octets as dotted-decimal text."""
	first, second, third, fourth = octets
	return ".".join(str(part) for part in (first, second, third, fourth))

def singleDigitRangeRe(d1, d2):
	"""Return a regex fragment matching one decimal digit between d1 and d2.

	Examples: (0, 9) -> "[0-9]", (3, 4) -> "[34]", (2, 7) -> "[2-7]", (5, 5) -> "5".
	When d2 is not larger than d1 the result is just str(d1).
	"""
	dif = d2 - d1
	if dif == 9:	# full digit range; maybe one day we can use \d
		return "[0-9]"
	if dif == 1:	# 2 consecutive digits: use shorter [34] instead of [3-4]
		return "[" + str(d1) + str(d2) + "]"
	if dif > 1:
		return "[" + str(d1) + "-" + str(d2) + "]"
	return str(d1)	# dif == 0

def octetPair2rangeRE(o1, o2):
	"""Return a regex fragment matching every decimal number from o1 to o2.

	Intended for octets (0..255). Examples: (200, 255) -> "2([0-4][0-9]|5[0-5])",
	(0, 99) -> "[1-9]?[0-9]", (192, 207) -> "(19[2-9]|20[0-7])".

	Raises IndexError when o1 is larger than o2.
	"""
	if o1 > o2:
		raise IndexError("First octet should be smaller:", o1, o2)

	stro1 = str(o1)
	stro2 = str(o2)
	re = ""
	if o1 == o2: 								# same
		re = stro1
	elif stro1[:-1] == stro2[:-1]: 				# only last digit varies
		re = stro1[:-1] + singleDigitRangeRe(o1 % 10, o2 % 10)
	elif stro1[:-2] == stro2[:-2]: 				# last 2 digits vary
		d1 = o1 % 10
		d2 = o2 % 10
		tens1 = (o1%100)//10 # not just 2nd-to-last-char, we want 0, if it is missing
		tens2 = (o2%100)//10
		maybe = ''		# becomes '?' when the tens digit may be absent (numbers 0..9)

		re = stro1[:-2]	# shared hundreds digit, if any
		res = []

		# first part - remainder of current 10, e.g. for 43-81, it is 43-50
		if (d1 == 0) and (tens2 > 1): # if possible, merge [0-9] to the middle part
			if (tens1 > 0) or o1 >= 100: # if we are processing a remainder of an octet (e.g. 00 of 200), we have to preserve 0
				tens1 -= 1
			else:
				maybe = '?'
		else:
			res.append(stro1[-2:-1] + singleDigitRangeRe(o1 % 10, 9))

		if d2 == 9:		# the last ten is complete, let the middle part cover it
			tens2 += 1

		# middle part, e.g. for 43-81, it is 50-79
		if tens2 - tens1 > 1:
			res.append(singleDigitRangeRe(tens1 + 1, tens2 - 1) + maybe + "[0-9]")

		# rest, partial last 10, e.g. for 43-81, it is 80-81
		if d2 != 9:
			res.append(str(tens2) + singleDigitRangeRe(0, o2 % 10))

		if len(res) > 1:
			re += '(' + '|'.join(res) + ')'
		else:
			re += res[0]
	else: # if stro1[:-3] == stro2[:-3]:		# last 3 digits vary, e.g. 123-234, 12-234, 1-234
		re = stro1[:-3] + "("

		# rest of first 100
		re += octetPair2rangeRE(o1, o1 // 100 * 100 + 99)
		re += "|"

		# middle 100s
		hundreds1 = o1//100
		hundreds2 = o2//100
		if (hundreds2 - hundreds1) > 1:
			#re += octetPair2rangeRE(hundreds1 + 1, hundreds2 - 1) + "[0-9][0-9]" # this will always be "1" for IP addresses
			re += "1[0-9][0-9]"
			re += "|"

		# partial last 100, e.g. 200-255
		re += octetPair2rangeRE(o2 // 100 * 100, o2)

		re += ")"
	return re

def makeEndRe(octets1, octets2):
	"""Return a regex fragment for all addresses from octets1 to octets2.

	octets1 and octets2 are sequences of octets (first and last address of the
	range, or the same-length tails of them during recursion). Octets are joined
	with "[.]". Trailing octets that span the full 0..255 are left out, so a
	range made of whole blocks ends in "[.]", e.g. "79[.]106[.]"; an empty
	string means "everything".

	A range end is merged into the run of full octets only when ALL of its
	remaining octets are 0 (start) or 255 (end). The earlier version looked at
	just the next octet, so e.g. 1.0.0.128 - 1.1.255.255 also matched
	1.0.0.0 - 1.0.0.127. Ranges whose octet spans 0..255 but whose following
	octets are partial used to raise NotImplementedError; they are now handled
	by the general case.

	NOTE(review): nothing is appended after a partial last octet, so a pattern
	ending in e.g. "([0-9]|1[0-5])" also accepts longer numbers starting with
	those digits when used with a prefix match - confirm whether a terminator
	is wanted.

	IndexError and TypeError are re-raised after printing both arguments.
	"""
	FIRST_IP = (0,0,0,0)
	LAST_IP = (255,255,255,255)
	try:
		if octets1[0] == 0 and octets2[0] == 255:
			common = min(len(octets1), len(octets2))
			if all(octets1[o] == 0 and octets2[o] == 255 for o in range(1, common)):
				return ""	# the whole remaining space - nothing to match
			# otherwise fall through: the general case below handles it
		if len(octets1) == 1 or len(octets2) == 1:
			return octetPair2rangeRE(octets1[0], octets2[0])
		if octets1[0] == octets2[0]:
			reThis = str(octets1[0])
			reRest = makeEndRe(octets1[1:], octets2[1:])
			return "[.]".join((reThis, reRest))

		# 194.245.1.5 - 200.248.39.95 = (194.([0-244].|245.(1.[5-255]|[2-255].))  | [195-199].  | 200.([0-247].|248.([0-38].|39[0-95])))
		# 1. 194.245.1.5 - 194.255.255.255 =
		# 	= 194.245.1.5 - 194.245.1.255 | 194.245.2.0 - 194.245.255.255 | 194.246.0.0 - 194.255.255.255 =
		#			= 194.245.1.[5-255]   | 194.245.[2-255]. 			  | 194.[246-255]. =
		#			= 194.(245.(1.[5-255] |	[2-255].)					  | [246-255].)
		# 2. [195-199].
		# 3. 200.0.0.0 - 200.248.39.95
		#	= 200.0.0.0 - 200.247.255.255 | 200.248.0.0 - 200.248.38.255  | 200.248.39.0 - 200.248.39.95
		#			= 200.[0-247].		  | 200.248.[0-38].				  | 200.248.39.[0-95]
		#			= 200.([0-247].|248.([0-38].|39[0-95]))
		startsOnBoundary = all(o == 0 for o in octets1[1:])
		endsOnBoundary = all(o == 255 for o in octets2[1:])
		res = []
		# 1. first part - remainder of current octet
		if startsOnBoundary:		# include in the middleRE
			middleo1 = octets1[0]
		else:
			res.append(str(octets1[0]) + "[.]" + makeEndRe(octets1[1:], LAST_IP))
			middleo1 = octets1[0] + 1
		if endsOnBoundary:			# include in the middleRE
			middleo2 = octets2[0]
		else:
			middleo2 = octets2[0] - 1
		# 2. middle - full octets between 1 and 2
		if middleo2 >= middleo1:
			res.append(octetPair2rangeRE(middleo1, middleo2) + "[.]")
		# 3. end - partial octet
		if not endsOnBoundary:		# not already included in middleRE
			res.append(str(octets2[0]) + "[.]" + makeEndRe(FIRST_IP, octets2[1:]))
		if len(res) > 1:
			return "(" + "|".join(res) + ")"
		return res[0]	# at least one part is always present here
	except (IndexError, TypeError):
		print(octets1, octets2)
		raise

def makeRe(octets1, octets2):
	"""Return the complete pattern for one address range.

	The pattern is a literal opening bracket ("[[]") followed by the fragment
	makeEndRe() builds for octets1..octets2.
	"""
	rangePart = makeEndRe(octets1, octets2)
	return "".join(("[[]", rangePart))

def createMergedRE(similar1, similar2): # create one RE from all pairs of octets, broken into pieces to accomodate max line length (1024)
	"""Combine several address ranges into as few patterns as possible.

	similar1[i] / similar2[i] are the octet tuples of the first / last address
	of range i. Either all ranges share one first octet, or there is a single
	range that crosses first octets; in that case only the first pair is
	converted.

	Returns a list of pattern strings, each starting with "[[]". More than one
	string is returned when the alternation would not fit into
	maxLineLen - filterRulePrefixLength characters.

	Fix: when the very first alternative is already longer than the budget, the
	splitter used to emit a pattern with an empty group "()", which matches
	every address with that first octet. Nothing is emitted for an empty slice
	now; the oversized alternative goes out with the next piece.
	"""
	if similar1[0][0] != similar2[0][0]:
		re1 = makeEndRe(similar1[0], similar2[0])
		return ["[[]" + re1]

	first = None	# index of the first range in the current run with the same 2nd octet
	res234 = []		# one alternative (octets 2..4) per run / range
	numPairs = len(similar1)
	for i in range(numPairs):
		if similar1[i][1] != similar2[i][1]:
			# range spans several second octets - convert on its own
			re1 = makeEndRe(similar1[i][1:], similar2[i][1:])
			res234.append(re1)
			first = None
			continue
		elif i < (numPairs - 1) and similar1[i][1] == similar1[i+1][1] == similar2[i+1][1]:
			# the next range stays within the same second octet - keep collecting
			if first is None:
				first = i
			continue

		if first is None:
			first = i
		re234 = str(similar1[i][1]) # RE of octet 2 and the rest
		res34 = [] # list of REs of the rest (octets 3 and 4)
		for o23 in range(first, i+1):
			o1 = similar1[o23]
			o2 = similar2[o23]
			re34 = makeEndRe(o1[2:], o2[2:])
			res34.append(re34)
		if len(res34) == 1:
			re234 += "[.]" + res34[0]
		elif len(res34) > 1:
			re234 += "[.](" + "|".join(res34) + ")"
		res234.append(re234)
		first = None

	# join REs with '|', breaking it into pieces that will fit on a line (max=1024 chars)
	res = []
	baseRe = "[[]" + str(similar1[0][0]) + "[.]"
	if len(res234) == 1:
		res.append(baseRe + res234[0])
	else:
		# maybe we can refactor out ending "[.]" to the end of combined RE
		allHaveEndDot = all(part.endswith("[.]") for part in res234)
		bsDotLen = len("[.]")
		baseLen = len(baseRe) + 2 # ()
		relen = baseLen
		first = 0
		for i in range(len(res234)):
			if allHaveEndDot:
				res234[i] = res234[i][:-bsDotLen]
			relen += len(res234[i]) + 1 # '|'
			if relen > (maxLineLen - filterRulePrefixLength - 1): # max line len - len of static string "...{opt: or, part: $message_headers ..."
				if i > first:	# never emit an empty "()" group
					partialRe = baseRe + "(" + "|".join(res234[first:i]) + ")"
					if allHaveEndDot:
						partialRe += "[.]"
					res.append(partialRe)
				relen = baseLen + len(res234[i])
				first = i
		partialRe = baseRe + "(" + "|".join(res234[first:]) + ")"
		if allHaveEndDot:
			partialRe += "[.]"
		res.append(partialRe)
	return res

def createREs(country, countryDict = None, compileREs = True):
	"""Build the merged patterns for all address ranges of one country.

	Consecutive ranges that stay within one first octet are handed to
	createMergedRE() together; a range crossing first octets is converted alone.

	Parameters:
		country - key into countryDict
		countryDict - result of loadIp2Country(); loaded when None
		compileREs - when True each item is (pattern, compiled regex),
			otherwise just the pattern string

	Returns the list of items in range order.
	"""
	if countryDict is None:
		countryDict = loadIp2Country()

	collected = []

	def emit(patterns):
		# store each pattern in the representation the caller asked for
		for pattern in patterns:
			if compileREs:
				collected.append((pattern, re.compile(pattern)))
			else:
				collected.append(pattern)

	groupStarts = []
	groupEnds = []
	for firstIp, lastIp in countryDict[country]:
		startOctets = num2octets(int(firstIp))
		endOctets = num2octets(int(lastIp))
		fitsGroup = (startOctets[0] == endOctets[0]) and (not groupStarts or startOctets[0] == groupStarts[-1][0])
		if fitsGroup:
			groupStarts.append(startOctets)
			groupEnds.append(endOctets)
			continue
		if groupStarts:
			# flush the collected group, then start a new one with the current range
			emit(createMergedRE(groupStarts, groupEnds))
			groupStarts = [startOctets]
			groupEnds = [endOctets]
		else:
			# nothing collected and this range crosses first octets: convert it directly
			emit(createMergedRE((startOctets,), (endOctets,)))
	if groupStarts:
		emit(createMergedRE(groupStarts, groupEnds))
	return collected

def makeFilterFromIpToCountryCSV(oneRecordPerLine=False):	# oneRecordPerLine = don't merge records starting with the same octet
	"""Write filterFileName with one filter entry per included country.

	The output starts with the contents of filterProlog (or a minimal YAML
	header when that file is missing) and ends with the contents of
	filterEpilog when present. A country gets an entry when its CSV name or its
	short name is listed in countriesToInclude. The entry's action is the
	"probable spam" save action when the country is also listed in
	countriesProbablySpam, the whitelist action otherwise.

	With oneRecordPerLine every address range becomes its own rule; otherwise
	the merged patterns from createREs() are used (this function used to carry
	its own copy of that grouping loop).
	"""
	countryDict = loadIp2Country()
	with open(filterFileName, "w") as f:
		if os.path.exists(filterProlog):
			with open(filterProlog, "r") as fp:
				f.write(fp.read())
		else:
			f.write("--- \nfilter: \n")
		for cName in sorted(countryDict):
			shortName = mapCountryName(cName)
			if not ((cName in countriesToInclude) or (shortName in countriesToInclude)):
				continue

			f.write("  - \n")

			# the list is optional: a user may have removed it from the settings
			if 'countriesProbablySpam' in globals() and ((cName in countriesProbablySpam) or (shortName in countriesProbablySpam)):
				f.write(filterActionSpamProbably.format(shortName) + "\n")
			else:
				f.write(filterActionWhitelist + "\n")

			f.write("    enabled: 1\n")

			f.write("    filtername: ")
			if cName.find(':') >= 0:	# a colon would otherwise be read as a YAML mapping
				f.write("\"" + cName + "\"\n")
			else:
				f.write(cName + "\n")

			f.write("    rules: \n")
			if oneRecordPerLine:
				patterns = [makeRe(num2octets(int(ip1)), num2octets(int(ip2))) for ip1, ip2 in countryDict[cName]]
			else:
				patterns = createREs(cName, countryDict, False)
			for pattern in patterns:
				f.write(filterRulePrefix + pattern + "\"}\n")
		if os.path.exists(filterEpilog):
			with open(filterEpilog, "r") as fp:
				f.write(fp.read())

####################################
# testing routines
####################################

def addCommas(num):
	"""Return str(num) with a comma inserted before every group of three characters, counted from the right."""
	text = str(num)
	head = len(text) % 3
	groups = [text[:head]] if head else []
	groups.extend(text[pos:pos + 3] for pos in range(head, len(text), 3))
	return ",".join(groups)

def numIPsInCountry(country, countryDict = None):
	"""Return the number of addresses covered by all ranges of a country.

	countryDict is the result of loadIp2Country(); when None the file is loaded
	without merging records.
	"""
	if countryDict is None:
		countryDict = loadIp2Country(False) # don't merge records, we want full count
	return sum(int(lastIp) - int(firstIp) + 1 for firstIp, lastIp in countryDict[country])

def dumpCountriesIPsAndREs(onlyThisCountry = None):
	"""Print every country with its ranges (numeric, dotted) and the pattern of each range.

	onlyThisCountry restricts the output to one country, given by its CSV name
	or its short name.
	"""
	countryDict = loadIp2Country()
	for cName in sorted(countryDict):
		if onlyThisCountry and onlyThisCountry not in (cName, mapCountryName(cName)):
			continue
		print(cName)
		for ip1, ip2 in countryDict[cName]:
			startOctets = num2octets(int(ip1))
			endOctets = num2octets(int(ip2))
			print("\t",ip1,"-",ip2,"=",octets2str(startOctets),"-",octets2str(endOctets),"=",makeRe(startOctets, endOctets))

def dumpCountriesIPsAndMergedREs(onlyThisCountry = None):
	"""Print every country with its ranges and its merged patterns.

	onlyThisCountry restricts the output to one country, given by its CSV name
	or its short name.

	Fix: the print loop used the variables left over from the collecting loop,
	so every line showed the ranges and patterns of the last country.
	"""
	countryDict = loadIp2Country()
	car = {} # country, addresses, REs
	for cName in sorted(countryDict):
		if onlyThisCountry:
			shortName = mapCountryName(cName)
			if (cName != onlyThisCountry) and (shortName != onlyThisCountry):
				continue
		car[cName] = (countryDict[cName], createREs(cName, countryDict, False))
	for cName, (aIPs, res) in car.items():
		print(cName, aIPs, res)

def dumpCounts(sortBy = "name"):
	"""Print record and address counts per country.

	sortBy selects the report:
		"name" - table of records and addresses, ordered by country name
		"numRecords" - short names with record counts, largest first
		"numIPs" - short names with address counts, largest first
		anything else - the countries at which the running address total passes
			another eighth of the 32-bit space (to split work into 8 jobs)
	"""
	countryDict = loadIp2Country(False) # don't merge records, we want full count
	print()
	print(len(countryDict), "countries", end=' ')
	numRecords = {}
	numIPs = {}
	total = 0
	for cName, ranges in countryDict.items():
		numRecords[cName] = len(ranges)
		total += len(ranges)
		for iptuple in ranges:
			ip1, ip2 = [int(a) for a in iptuple]
			numIPs[cName] = numIPs.get(cName, 0) + (ip2 - ip1 + 1)
	print("-", total, "records")
	print()

	if sortBy == "name":
		print("%-37s\t%10s\t%13s" % ("name", "records", "IPs"))
		print("%-37s\t%10s\t%13s" % ("----", "-------", "---"))
		for cName in sorted(countryDict):
			print("%-37s\t%10d\t%13s" % (cName, numRecords[cName], addCommas(numIPs[cName])))
		return
	if sortBy == "numRecords":
		for name, count in reversed(sorted(numRecords.items(), key=lambda item: item[1])):
			print("%-24s\t%s" % (mapCountryName(name), count))
		return
	if sortBy == "numIPs":
		for name, count in reversed(sorted(numIPs.items(), key=lambda item: item[1])):
			print("%-24s\t%s" % (mapCountryName(name), addCommas(count)))
		return

	# sortBy == "showSplit": show on which countries to split to divide work by 8
	eighth = 0xffffffff // 8
	parts = 1
	ipsSoFar = 0
	for cName in sorted(countryDict):
		ipsSoFar += numIPs[cName]
		if ipsSoFar > eighth * parts:
			print(cName, addCommas(ipsSoFar))
			parts += 1

def test_single_octets():
	"""Print the pattern octetPair2rangeRE() produces for every pair low <= high within 0..255."""
	allOctets = range(256)
	for low in allOctets:
		for high in allOctets[low:]:
			print(low, "-", high, ":", octetPair2rangeRE(low, high))

def testMakeEndRe(ip1 = None, ip2 = None, reStr = ""):
	"""Compare makeEndRe() output with expected patterns and print every mismatch.

	Without ip1 and ip2 the built-in table is checked. With both given (dotted
	strings) only that single range is checked against reStr.
	"""
	builtInCases = (
		((186, 1, 192, 0), (186, 1, 207, 255), "186[.]1[.](19[2-9]|20[0-7])[.]"),
		((173, 45, 106, 64), (173, 45, 106, 79), "173[.]45[.]106[.](6[4-9]|7[0-9])"), # US
		((82, 139, 192, 0),  (82, 140, 63, 255), "82[.](139[.](19[2-9]|2([0-4][0-9]|5[0-5]))[.]|140[.]([1-5]?[0-9]|6[0-3])[.])"),
		((59, 191, 240, 0),(60, 31, 255, 255), "(59[.](191[.]2(4[0-9]|5[0-5])[.]|(19[2-9]|2([0-4][0-9]|5[0-5]))[.])|60[.]([12]?[0-9]|3[01])[.])"), # China
		((118, 244, 0, 0), (119, 2, 31, 255), "(118[.]2(4[4-9]|5[0-5])[.]|119[.]([01][.]|2[.]([12]?[0-9]|3[01])[.]))"), # China
		((194, 245, 1, 5), (194, 246, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|246[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"), # Germany
		((194, 245, 1, 5), (194, 247, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|246[.]|247[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"), # Germany
		((194, 245, 1, 5), (194, 248, 39, 95), "194[.](245[.](1[.](([5-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))|(([2-9]|[1-9][0-9])|1[0-9][0-9]|2([0-4][0-9]|5[0-5]))[.])|24[67][.]|248[.](([12]?[0-9]|3[0-8])[.]|39[.]([1-8]?[0-9]|9[0-5])))"), # Germany
		((41, 200, 0, 0), (41, 201, 255, 255), "41[.]20[01][.]"),
		((79, 106, 0, 0), (79, 106, 255, 255), "79[.]106[.]"), # Albania
		((79, 171, 48, 0), (79, 171, 65, 255), "79[.]171[.](4[89]|5[0-9]|6[0-5])[.]"), # Albania
		((79, 171, 48, 0), (79, 171, 75, 255), "79[.]171[.](4[89]|[56][0-9]|7[0-5])[.]"), # Albania
		((79, 171, 48, 0), (79, 171, 85, 255), "79[.]171[.](4[89]|[5-7][0-9]|8[0-5])[.]"), # Albania
		((200, 41, 0, 0), (200, 42, 159, 255), "200[.](41[.]|42[.]([1-9]?[0-9]|1[0-5][0-9])[.])"), # Argentina
		((200, 41, 223, 0), (200, 42, 159, 255), "200[.](41[.]2(2[3-9]|[34][0-9]|5[0-5])[.]|42[.]([1-9]?[0-9]|1[0-5][0-9])[.])"), # above, modified
		((200, 50, 0, 0), (200, 51, 255, 255), "200[.]5[01][.]"), # Argentina
		((200, 69, 192, 0), (200, 70, 255, 255), "200[.](69[.](19[2-9]|2([0-4][0-9]|5[0-5]))[.]|70[.])"), # Argentina
		)
	cases = builtInCases
	if ip1 and ip2:
		# a single range given by the caller replaces the table
		startOctets = [int(a) for a in ip1.split(".")]
		endOctets = [int(a) for a in ip2.split(".")]
		cases = [(startOctets, endOctets, reStr)]
	for rangeStart, rangeEnd, expected in cases:
		actual = makeEndRe(rangeStart, rangeEnd)
		if actual == expected:
			continue
		print(rangeStart, rangeEnd)
		print(actual, "<= is")
		print(expected, "<= should be")
		print("---------------")

def makeReFrom2Strings(ip1, ip2):
	"""Print the pattern for the range between two dotted-decimal address strings."""
	startOctets, endOctets = ([int(a) for a in text.split(".")] for text in (ip1, ip2))
	print(makeRe(startOctets, endOctets))

def makeRangeFrom2Strings(ip1, ip2):
	"""Print a compact textual description of the range ip1..ip2 (dotted strings).

	Octets are copied until the first position that is written as "low-high";
	a trailing run of 0..255 octets is cut off, leaving the final dot.
	Examples: 10.20.0.0 .. 10.20.255.255 prints "10.20." and
	10.20.30.0 .. 10.20.40.255 prints "10.20.30-40.".
	"""
	low = [int(part) for part in ip1.split(".")]
	high = [int(part) for part in ip2.split(".")]
	pieces = []
	for pos in range(4):
		a = low[pos]
		b = high[pos]
		if pieces:
			pieces.append(".")
		if a == b or (pos < 3 and high[pos + 1] != 255):
			pieces.append(str(a))
			continue
		if a == 0 and b == 255:
			break
		if pos > 0 and low[pos - 1] == high[pos - 1]:
			pieces.append("{0}-{1}".format(a, b))
		else:
			# NOTE(review): for pos == 0 the index pos - 1 wraps to the last octet - confirm intent
			pieces.append("{0}-{1}.{2}".format(a, high[pos - 1], b))
		if pos < 3:
			pieces.append(".")
		break
	print("".join(pieces))

def mapUserCountyName(userName, cName): # allow user to use short names, e.g. "US", instead of "United States"
	"""Return cName when userName is the short name of cName, otherwise userName unchanged."""
	return cName if userName == mapCountryName(cName) else userName

def testIPsAgainstREs(mergedRecords = True, startCountry = "A", endCountry = "\xff"):
	"""
	check every IP for a match against corresponding RegEx - takes almost 4 hours on 2.8 GHz MacPro

	Countries are visited in name order from startCountry to endCountry (short
	names are accepted). For every range one pattern is built with makeRe() and
	each address of the range is matched against it; checking of a range stops
	at its first address that does not match, which is printed.
	"""
	countryDict = loadIp2Country(mergedRecords)

	ipsTested = 0
	for cName in sorted(countryDict):
		# translate a short name given by the user once its country comes up
		startCountry = mapUserCountyName(startCountry, cName)
		endCountry = mapUserCountyName(endCountry, cName)
		if cName < startCountry:
			continue
		if cName > endCountry:
			break
		if startCountry != endCountry: 	# show progress
			print(cName, end=' ')
			sys.stdout.flush()

		ranges = countryDict[cName]
		print("-", len(ranges), "records", addCommas(numIPsInCountry(cName, countryDict)), "IPs ", end=' ')
		for iptuple in ranges:
			ip1, ip2 = [int(a) for a in iptuple]
			rangeRe = makeRe(num2octets(ip1), num2octets(ip2))
			regEx = re.compile(rangeRe)
			for ip in range(ip1, ip2 + 1):
				ipsTested += 1
				if (ipsTested % 1000000) == 0:	# one dot per million addresses
					sys.stdout.write(".")
					sys.stdout.flush()
				ipStr = "[" + octets2str(num2octets(ip)) + "]"
				if not regEx.match(ipStr):
					print("    *** No match for", cName, ipStr[1:-1], "-", rangeRe)
					break
		print()

def testIPsAgainstMergedREs(startCountry = "A", endCountry = "\xff", startPercent = 0, endPercent = 100):
	"""
	check every IP for a match against all RegEx-es of its country (since they are merged we don't know which one it should be)

	Countries are visited in name order from startCountry to endCountry (short
	names are accepted). startPercent / endPercent (numbers or numeric strings)
	limit the check to that part of each country's addresses. Runs of addresses
	that match none of the patterns are printed as "first - last".

	Fixes:
	- the two percentages were computed with floor division on floats, so the
	  progress stayed at 0 until a country was finished and the share "of
	  total" was always 0.0%; true division is used now
	- skipping addresses below startPercent used "continue" without advancing
	  the address, so the skipped part was never actually skipped
	"""
	countryDict = loadIp2Country()

	startPercent = int(startPercent)
	endPercent = int(endPercent)

	ipsTested = 0

	for cName in sorted(countryDict):
		# translate a short name given by the user once its country comes up
		startCountry = mapUserCountyName(startCountry, cName)
		endCountry = mapUserCountyName(endCountry, cName)
		if cName < startCountry:
			continue
		elif cName > endCountry:
			break

		# show progress
		if startCountry != endCountry:
			print(cName, end=' ')
			sys.stdout.flush()

		# create (merged) REs for all records in this country
		res = createREs(cName, countryDict)

		# for each country, compare each IP belonging to it, to all its REs, and report if it does not match any
		numIPsInThisCountry = numIPsInCountry(cName, countryDict)
		print("has", addCommas(numIPsInThisCountry), "IPs", end=' ')
		# per-country progress is only tracked for partial runs and big countries
		trackProgress = startPercent > 0 or numIPsInThisCountry > 1000000
		ipsDoneThisCountry = 0
		percentDoneThisCountry = 0
		percentReportedThisCountry = 0
		needCR = True
		aIPs = countryDict[cName]
		for iptuple in aIPs:
			ip1, ip2 = [int(a) for a in iptuple]
			firstIpReported = False	# True while inside a run of unmatched addresses
			ip = ip1
			while ip <= ip2:
				if trackProgress:
					ipsDoneThisCountry += 1
					percentDoneThisCountry = float(ipsDoneThisCountry) / numIPsInThisCountry * 100
					if percentDoneThisCountry < startPercent:
						if (ipsDoneThisCountry % 1000000) == 0:
							sys.stdout.write("%s" % ",")
							sys.stdout.flush()
						ip += 1	# move on, otherwise the same address is looked at again
						continue
					elif percentDoneThisCountry > (endPercent + 0.1):
						break
					else:
						if (ipsDoneThisCountry % 1000000) == 0:
							sys.stdout.write("%s" % ".")
							sys.stdout.flush()
					if numIPsInThisCountry > 50*1000*1000:
						if percentDoneThisCountry > percentReportedThisCountry + 1:
							if not firstIpReported:	# skip it if we are in-between start and end range
								print("{0:.0f}%".format(percentDoneThisCountry), end=' ')
								sys.stdout.flush()
							percentReportedThisCountry = percentDoneThisCountry
							needCR = True

				ipsTested += 1
				ipStr = "[" + octets2str(num2octets(ip)) + "]"
				matched = False
				for reStr, regEx in res:
					if regEx.match(ipStr):
						matched = True
						break
				if not matched:
					if not firstIpReported:
						if needCR:
							print()
							needCR = False
						print("*** No match for", cName, ipStr[1:-1], "-", end=' ')
						firstIpReported = True
						lastIp = int(ip2)
					elif ip == lastIp:
						print(octets2str(num2octets(ip)), "(full range)")
						firstIpReported = False
				elif firstIpReported:
					# first matching address after a run of misses: close the run
					print(octets2str(num2octets(ip-1)))
					firstIpReported = False
				ip += 1
		print(" - so far tested", addCommas(ipsTested), "IPs", end=' ')
		if startCountry == "A":
			print("=", "{0:.1%}".format(float(ipsTested) / 0xffffffff), "of total")
		else:
			print()

####################################
# main
####################################

if __name__ == "__main__":
	# The first command-line argument selects the action; without a known
	# selector the filter file is generated. Optional arguments are passed on
	# positionally, as many as were given (up to each function's limit).
	#sys.setrecursionlimit(15000)
	selector = sys.argv[1] if len(sys.argv) > 1 else ""
	countReports = {"-dcr": "numRecords", "-dci": "numIPs", "-dcs": "showSplit"}

	if selector == "-dcir":
		dumpCountriesIPsAndREs(*sys.argv[2:3])
	elif selector == "-dc":
		dumpCounts()
	elif selector in countReports:
		# by number of records / by number of IPs / how to split countries to 8 jobs
		dumpCounts(countReports[selector])
	elif selector == "-op":
		print(octetPair2rangeRE(int(sys.argv[2]), int(sys.argv[3])))
	elif selector == "-tso":
		test_single_octets()
	elif selector == "-tmr":
		# needs all three of ip1, ip2 and expected RE; otherwise the built-in table is used
		if len(sys.argv) > 4:
			testMakeEndRe(sys.argv[2], sys.argv[3], sys.argv[4])
		else:
			testMakeEndRe()
	elif selector == "-tre":
		if len(sys.argv) <= 5:	# more than three extra arguments: nothing is run
			testIPsAgainstREs(*sys.argv[2:5])
	elif selector == "-tmre":
		testIPsAgainstMergedREs(*sys.argv[2:6])
	elif selector == "-mre":
		makeReFrom2Strings(sys.argv[2], sys.argv[3])
	elif selector == "-mr":
		makeRangeFrom2Strings(sys.argv[2], sys.argv[3])
	else:
		makeFilterFromIpToCountryCSV()