diff options
Diffstat (limited to 'scripts/maint/geoip/geoip-db-tool/src/main.rs')
-rw-r--r-- | scripts/maint/geoip/geoip-db-tool/src/main.rs | 239 |
1 files changed, 239 insertions, 0 deletions
diff --git a/scripts/maint/geoip/geoip-db-tool/src/main.rs b/scripts/maint/geoip/geoip-db-tool/src/main.rs new file mode 100644 index 0000000000..9a22598a35 --- /dev/null +++ b/scripts/maint/geoip/geoip-db-tool/src/main.rs @@ -0,0 +1,239 @@ +/// A basic tool to convert IPFire Location dumps into the CSV formats that Tor +/// expects. +mod db; + +use argh::FromArgs; +use ipnetwork::IpNetwork; +use rangemap::RangeInclusiveMap; + +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::net::{IpAddr, Ipv6Addr}; +use std::num::NonZeroU32; +use std::path::PathBuf; + +fn default_ipv4_path() -> PathBuf { + "./geoip".into() +} +fn default_ipv6_path() -> PathBuf { + "./geoip6".into() +} + +#[derive(FromArgs)] +/// Convert an IPFire Location dump into CSV geoip files. +struct Args { + /// where to store the IPv4 geoip output + #[argh(option, default = "default_ipv4_path()", short = '4')] + output_ipv4: PathBuf, + + /// where to store the IPv6 geoip6 output + #[argh(option, default = "default_ipv6_path()", short = '6')] + output_ipv6: PathBuf, + + /// where to find the dump file + #[argh(option, short = 'i')] + input: PathBuf, + + /// whether to include AS information in our output + #[argh(switch)] + include_asn: bool, + + /// where to store the AS map. + #[argh(option)] + output_asn: Option<PathBuf>, +} + +/// Represents a network block from running `location dump`. +#[derive(Debug, Clone)] +pub struct NetBlock { + pub net: IpNetwork, + pub cc: [u8; 2], + pub asn: Option<NonZeroU32>, + pub is_anon_proxy: bool, + pub is_anycast: bool, + pub is_satellite: bool, +} + +/// Represents an AS definition from running `location dump`. +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct AsBlock { + pub asn: NonZeroU32, + pub name: String, +} + +impl PartialEq for NetBlock { + fn eq(&self, other: &Self) -> bool { + self.net == other.net + } +} + +/// We define network blocks as being sorted first from largest to smallest, +/// then by address. +impl Ord for NetBlock { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.net + .prefix() + .cmp(&other.net.prefix()) + .then_with(|| self.net.network().cmp(&other.net.network())) + } +} + +impl PartialOrd for NetBlock { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Eq for NetBlock {} + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +struct NetDefn { + cc: [u8; 2], + asn: Option<NonZeroU32>, +} + +impl NetBlock { + fn into_defn(self, include_asn: bool) -> NetDefn { + if include_asn { + NetDefn { + cc: self.cc, + asn: self.asn, + } + } else { + NetDefn { + cc: self.cc, + asn: None, + } + } + } +} + +impl NetDefn { + fn cc(&self) -> &str { + std::str::from_utf8(&self.cc).unwrap() + } + fn asn(&self) -> u32 { + match self.asn { + Some(v) => v.into(), + None => 0, + } + } +} + +const PROLOGUE: &str = "\ +# This file has been converted from the IPFire Location database +# using Tor's geoip-db-tool. For more information on the data, see +# https://location.ipfire.org/. +# +# Below is the header from the original export: +# +"; + +/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files. +/// +/// This code tries to be "efficient enough"; most of the logic is handled by +/// using the rangemap crate. +fn convert(args: Args) -> std::io::Result<()> { + let input = args.input.as_path(); + let output_v4 = args.output_ipv4.as_path(); + let output_v6 = args.output_ipv6.as_path(); + let include_asn = args.include_asn; + + let f = File::open(input)?; + let f = BufReader::new(f); + let mut blocks = Vec::new(); + let mut networks = Vec::new(); + + let mut reader = db::BlockReader::new(f.lines()); + let hdr = reader.extract_header(); + // Read blocks, and then sort them by specificity and address. + for nb in reader { + match nb { + db::AnyBlock::AsBlock(a) => networks.push(a), + db::AnyBlock::NetBlock(n) => blocks.push(n), + _ => {} + } + } + blocks.sort(); + + // Convert the sorted blocks into a map from address ranges into + // country codes. + // + // Note that since we have sorted the blocks from least to most specific, + // we will be puttting them into the maps in the right order, so that the + // most specific rule "wins". + // + // We use u32 and u128 as the index types for these RangeInclusiveMaps, + // so that we don't need to implement a step function for IpAddr. + let mut v4map: RangeInclusiveMap<u32, NetDefn, _> = RangeInclusiveMap::new(); + let mut v6map: RangeInclusiveMap<u128, NetDefn, _> = RangeInclusiveMap::new(); + + let mut n = 0usize; + let num_blocks = blocks.len(); + for nb in blocks { + n += 1; + if n % 100000 == 0 { + println!("{}/{}", n, num_blocks); + } + let start = nb.net.network(); + let end = nb.net.broadcast(); + match (start, end) { + (IpAddr::V4(a), IpAddr::V4(b)) => { + v4map.insert(a.into()..=b.into(), nb.into_defn(include_asn)); + } + (IpAddr::V6(a), IpAddr::V6(b)) => { + v6map.insert(a.into()..=b.into(), nb.into_defn(include_asn)); + } + (_, _) => panic!("network started and ended in different families!?"), + } + } + + // Write the ranges out to the appropriate files, in order. + let mut v4 = BufWriter::new(File::create(output_v4)?); + let mut v6 = BufWriter::new(File::create(output_v6)?); + + v4.write_all(PROLOGUE.as_bytes())?; + v4.write_all(hdr.as_bytes())?; + for (r, defn) in v4map.iter() { + let a: u32 = *r.start(); + let b: u32 = *r.end(); + if include_asn { + writeln!(&mut v4, "{},{},{},{}", a, b, defn.cc(), defn.asn())?; + } else { + writeln!(&mut v4, "{},{},{}", a, b, defn.cc())?; + } + } + + v6.write_all(PROLOGUE.as_bytes())?; + v6.write_all(hdr.as_bytes())?; + for (r, defn) in v6map.iter() { + let a: Ipv6Addr = (*r.start()).into(); + let b: Ipv6Addr = (*r.end()).into(); + if include_asn { + writeln!(&mut v6, "{},{},{},{}", a, b, defn.cc(), defn.asn())?; + } else { + writeln!(&mut v6, "{},{},{}", a, b, defn.cc())?; + } + } + + // The documentation says you should always flush a BufWriter. + v4.flush()?; + v6.flush()?; + + if let Some(output_asn) = args.output_asn { + networks.sort(); + let mut asn = BufWriter::new(File::create(output_asn)?); + for net in networks { + writeln!(&mut asn, "{},{}", net.asn, net.name)?; + } + asn.flush()?; + } + + Ok(()) +} + +fn main() -> std::io::Result<()> { + let args: Args = argh::from_env(); + + convert(args) +} |