Rust tool to convert IPFire Location dump into CSV format.

The IPFire people provide a tool that collects data from several top-level sources, combines it into a single database, and annotates it with optional overrides. This tool transforms the "dump" format of their database into the form Tor expects.
author: Nick Mathewson <nickm@torproject.org> 2021-02-22 08:30:11 -0500
committer: Nick Mathewson <nickm@torproject.org> 2021-02-22 12:25:18 -0500
commit: 0d4237839b21b466526a01147538d09c117cc884 (patch)
tree: f13c7c70aed9ae3645c0eb171348f035e468ad8b /scripts
parent: 8ccfd4a51ad55e9834cffcc91cbaa13e1f19c8ff (diff)
download: tor-0d4237839b21b466526a01147538d09c117cc884.tar.gz
tor-0d4237839b21b466526a01147538d09c117cc884.zip
6 files changed, 349 insertions, 0 deletions
diff --git a/scripts/maint/geoip/README.geoip b/scripts/maint/geoip/README.geoip
new file mode 100644
index 0000000000..0ed94b2276
--- /dev/null
+++ b/scripts/maint/geoip/README.geoip
@@ -0,0 +1,25 @@
+To generate new geoip files, you'll need to install the
+libloc/"location" tool provided by https://location.ipfire.org/.
+I personally build it with:
+
+  ./configure CFLAGS='-g -O2' --disable-perl --without-systemd --prefix=/opt/libloc
+  make
+  make install
+
+Then (after adjusting PATH and PYTHONPATH) you can get the latest
+dump with:
+
+  location update
+  location dump geoip-dump.txt
+
+And transform it into geoip files by running, from the geoip-db-tool directory:
+
+  cargo run --release -- -i geoip-dump.txt
+
+
+==============================
+
+Note that the current version "0.1.9" of rangemap has a performance
+bug, making this tool quite slow.  Previous versions had a
+correctness bug that made the output needlessly long.  With luck,
+there will soon be a fast correct rangemap version.
+\ No newline at end of file
diff --git a/scripts/maint/geoip/geoip-db-tool/.gitignore b/scripts/maint/geoip/geoip-db-tool/.gitignore
new file mode 100644
index 0000000000..eb5a316cbd
--- /dev/null
+++ b/scripts/maint/geoip/geoip-db-tool/.gitignore
@@ -0,0 +1 @@
+target
diff --git a/scripts/maint/geoip/geoip-db-tool/Cargo.toml b/scripts/maint/geoip/geoip-db-tool/Cargo.toml
new file mode 100644
index 0000000000..b08863924a
--- /dev/null
+++ b/scripts/maint/geoip/geoip-db-tool/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "geoip-db-tool"
+version = "0.1.0"
+authors = ["Nick Mathewson <nickm@torproject.org>"]
+edition = "2018"
+license = "MIT OR Apache-2.0"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ipnetwork= "0.17.0"
+rangemap= "0.1.9"
+# I use this for now to avoid a performance hit due to a bug on 0.1.9
+# rangemap = {version = "*", path = "/home/nickm/src/rangemap/" }
+argh = "0.1.4"
diff --git a/scripts/maint/geoip/geoip-db-tool/src/db.rs b/scripts/maint/geoip/geoip-db-tool/src/db.rs
new file mode 100644
index 0000000000..eaadd4c612
--- /dev/null
+++ b/scripts/maint/geoip/geoip-db-tool/src/db.rs
@@ -0,0 +1,126 @@
+//! Code to parse a dump file.
+use std::collections::HashMap;
+use std::convert::TryInto;
+use std::iter::Peekable;
+
+use super::NetBlock;
+
+/// Reads a dump file line by line and yields the network blocks found in it.
+///
+/// The underlying line iterator is kept peekable so that the header can be
+/// consumed without swallowing the first non-header line.
+pub struct BlockReader<I: Iterator<Item = std::io::Result<String>>> {
+    /// Remaining, not yet consumed, lines of the dump.
+    iter: Peekable<I>,
+}
+
+/// One parsed block of the dump file.
+enum AnyBlock {
+    /// A block without a `net` or `country` entry; callers skip these.
+    NotNet,
+    /// A block describing a network and its country code.
+    NetBlock(NetBlock),
+}
+
+impl<I> BlockReader<I>
+where
+    I: Iterator<Item = std::io::Result<String>>,
+{
+    /// Create a reader over `iter`, an iterator yielding the lines of a dump
+    /// file.
+    pub fn new(iter: I) -> Self {
+        BlockReader {
+            iter: iter.peekable(),
+        }
+    }
+
+    /// Extract the initial header from the file.
+    ///
+    /// Consumes every leading line that starts with `#` and returns them,
+    /// each terminated by a newline.  Scanning stops at the first line that
+    /// does not start with `#`, at a read error, or at the end of input; the
+    /// item that stopped the scan is left in place for the block parser.
+    pub fn extract_header(&mut self) -> String {
+        let mut res = String::new();
+
+        while let Some(Ok(line)) = self.iter.peek() {
+            if !line.starts_with('#') {
+                break;
+            }
+            res.push_str(line);
+            res.push('\n');
+            let _ = self.iter.next();
+        }
+
+        res
+    }
+
+    /// Extract the next empty-line-delimited block from the file.
+    ///
+    /// Returns `None` once the input is exhausted, `Some(Ok(_))` for each
+    /// block, and `Some(Err(_))` if a line cannot be read or the block is
+    /// malformed (a line without a `:`, an unparseable `net`, or a `country`
+    /// that is not exactly two bytes long).
+    ///
+    /// This isn't terribly efficient, but it's "fast enough".
+    fn get_block(&mut self) -> Option<std::io::Result<AnyBlock>> {
+        /// Build the error reported for every kind of malformed input.
+        fn invalid_data(msg: String) -> std::io::Error {
+            std::io::Error::new(std::io::ErrorKind::InvalidData, msg)
+        }
+
+        /// A flag is set only when its entry is present with the value `true`.
+        fn is_true(v: Option<&String>) -> bool {
+            v.map_or(false, |s| s == "true")
+        }
+
+        let mut kv = HashMap::new();
+
+        for line in &mut self.iter {
+            let line_orig = match line {
+                Ok(l) => l,
+                Err(e) => return Some(Err(e)),
+            };
+            // Drop any trailing comment, then surrounding whitespace.
+            let line = line_orig.split('#').next().unwrap_or("").trim();
+            if line.is_empty() {
+                if kv.is_empty() {
+                    // Still looking for the start of a block.
+                    continue;
+                }
+                // An empty line after some content terminates the block.
+                break;
+            }
+            let mut parts = line.splitn(2, ':');
+            match (parts.next(), parts.next()) {
+                (Some(key), Some(value)) => {
+                    kv.insert(key.trim().to_string(), value.trim().to_string());
+                }
+                // Report the problem instead of pretending the file ended here.
+                _ => {
+                    return Some(Err(invalid_data(format!(
+                        "expected `key: value`, found {:?}",
+                        line
+                    ))))
+                }
+            }
+        }
+
+        if kv.is_empty() {
+            return None;
+        }
+
+        let net = match kv.get("net") {
+            None => return Some(Ok(AnyBlock::NotNet)),
+            Some(net) => match net.parse() {
+                Ok(parsed) => parsed,
+                Err(_) => {
+                    return Some(Err(invalid_data(format!(
+                        "invalid network {:?}",
+                        net
+                    ))))
+                }
+            },
+        };
+
+        let cc: [u8; 2] = match kv.get("country") {
+            None => return Some(Ok(AnyBlock::NotNet)),
+            Some(country) => match country.as_bytes().try_into() {
+                Ok(cc) => cc,
+                Err(_) => {
+                    return Some(Err(invalid_data(format!(
+                        "country code {:?} is not two bytes long",
+                        country
+                    ))))
+                }
+            },
+        };
+
+        // NOTE(review): confirm against real `location dump` output that
+        // flags are spelled `true` and that the anycast key really is
+        // `is-anycast-proxy`.  Only `net` and `cc` are used when the CSV
+        // files are written, so a mismatch would not change the output.
+        let is_anon_proxy = is_true(kv.get("is-anonymous-proxy"));
+        let is_anycast = is_true(kv.get("is-anycast-proxy"));
+        let is_satellite = is_true(kv.get("is-satellite-provider"));
+
+        Some(Ok(AnyBlock::NetBlock(NetBlock {
+            net,
+            cc,
+            is_anon_proxy,
+            is_anycast,
+            is_satellite,
+        })))
+    }
+}
+
+impl<I> Iterator for BlockReader<I>
+where
+    I: Iterator<Item = std::io::Result<String>>,
+{
+    type Item = NetBlock;
+
+    /// Return the next network block, skipping blocks that do not describe a
+    /// network with a country.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the input cannot be read or is malformed.  The item type
+    /// cannot carry an error, and treating the error as end-of-input would
+    /// silently produce a truncated database.
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.get_block()? {
+                Ok(AnyBlock::NotNet) => continue,
+                Ok(AnyBlock::NetBlock(n)) => return Some(n),
+                Err(e) => panic!("error while reading location dump: {}", e),
+            }
+        }
+    }
+}
diff --git a/scripts/maint/geoip/geoip-db-tool/src/main.rs b/scripts/maint/geoip/geoip-db-tool/src/main.rs
new file mode 100644
index 0000000000..38d70f7e1b
--- /dev/null
+++ b/scripts/maint/geoip/geoip-db-tool/src/main.rs
@@ -0,0 +1,165 @@
+//! A basic tool to convert IPFire Location dumps into the CSV formats that Tor
+//! expects.
+mod db;
+
+use argh::FromArgs;
+use ipnetwork::IpNetwork;
+use rangemap::RangeInclusiveMap;
+
+use std::fs::File;
+use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::net::{IpAddr, Ipv6Addr};
+use std::path::{Path, PathBuf};
+
+/// Where the IPv4 output goes when no path is given on the command line.
+fn default_ipv4_path() -> PathBuf {
+    PathBuf::from("./geoip")
+}
+/// Where the IPv6 output goes when no path is given on the command line.
+fn default_ipv6_path() -> PathBuf {
+    PathBuf::from("./geoip6")
+}
+
+// Command-line arguments.  The `///` comments on this struct and its fields
+// are picked up by argh as the help text, so they are user-facing strings.
+#[derive(FromArgs)]
+/// Convert an IPFire Location dump into CSV geoip files.
+struct Args {
+    /// where to store the IPv4 geoip output
+    #[argh(option, default = "default_ipv4_path()", short = '4')]
+    output_ipv4: PathBuf,
+
+    /// where to store the IPv6 geoip6 output
+    #[argh(option, default = "default_ipv6_path()", short = '6')]
+    output_ipv6: PathBuf,
+
+    /// where to find the dump file
+    #[argh(option, short = 'i')]
+    input: PathBuf,
+}
+
+/// Represents a network block from running `location dump`.
+#[derive(Debug, Clone)]
+pub struct NetBlock {
+    /// The network, parsed from the block's `net` entry.
+    pub net: IpNetwork,
+    /// The two bytes of the block's `country` entry.
+    pub cc: [u8; 2],
+    /// Whether the block's `is-anonymous-proxy` entry was `true`.
+    pub is_anon_proxy: bool,
+    /// Whether the block's `is-anycast-proxy` entry was `true`.
+    pub is_anycast: bool,
+    /// Whether the block's `is-satellite-provider` entry was `true`.
+    pub is_satellite: bool,
+}
+
+/// Two blocks are equal when they have the same prefix length and the same
+/// network address; the country code and flags are not considered.
+///
+/// Equality is defined through `cmp` so that `a == b` holds exactly when
+/// `a.cmp(&b)` is `Equal`, as the `Ord` contract requires.  Comparing the
+/// `net` fields directly would also compare any host bits, which `cmp`
+/// ignores.
+impl PartialEq for NetBlock {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == std::cmp::Ordering::Equal
+    }
+}
+
+/// Network blocks order by prefix length first (shortest prefix, i.e. the
+/// largest network, first) and by network address second.
+impl Ord for NetBlock {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let lhs = (self.net.prefix(), self.net.network());
+        let rhs = (other.net.prefix(), other.net.network());
+        lhs.cmp(&rhs)
+    }
+}
+
+/// The ordering is total, so this simply defers to `Ord`.
+impl PartialOrd for NetBlock {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+// `Ord` requires `Eq`; this marker impl adds no methods.
+impl Eq for NetBlock {}
+
+/// Comment block written at the top of both output files, immediately before
+/// the header copied from the dump.
+const PROLOGUE: &str = "\
+# This file has been converted from the IPFire Location database
+# using Tor's geoip-db-tool.  For more information on the data, see
+# https://location.ipfire.org/.
+#
+# Below is the header from the original export:
+#
+";
+
+/// Convert the `location dump` file at `input` into two CSV files: IPv4
+/// ranges go to `output_v4` and IPv6 ranges to `output_v6`.
+///
+/// Both outputs start with `PROLOGUE` followed by the header of the dump.
+/// Performance is meant to be "good enough"; the rangemap crate does the
+/// heavy lifting.
+fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> {
+    let mut reader = db::BlockReader::new(BufReader::new(File::open(input)?).lines());
+    let hdr = reader.extract_header();
+
+    // Collect every block, then order them by specificity and address.
+    let mut blocks: Vec<NetBlock> = reader.collect();
+    blocks.sort();
+
+    // Build maps from address ranges to country codes.
+    //
+    // The blocks are now sorted from least to most specific, so inserting
+    // them in order lets the most specific rule overwrite the broader ones.
+    //
+    // Keying the maps on u32 and u128 avoids having to implement a step
+    // function for IpAddr.
+    let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new();
+    let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new();
+
+    let num_blocks = blocks.len();
+    for (idx, nb) in blocks.into_iter().enumerate() {
+        // Report progress after every 100000 blocks.
+        let done = idx + 1;
+        if done % 100000 == 0 {
+            println!("{}/{}", done, num_blocks);
+        }
+        match (nb.net.network(), nb.net.broadcast()) {
+            (IpAddr::V4(first), IpAddr::V4(last)) => {
+                v4map.insert(u32::from(first)..=u32::from(last), nb.cc);
+            }
+            (IpAddr::V6(first), IpAddr::V6(last)) => {
+                v6map.insert(u128::from(first)..=u128::from(last), nb.cc);
+            }
+            _ => panic!("network started and ended in different families!?"),
+        }
+    }
+
+    // Create both outputs up front, then emit the ranges in order.
+    let mut v4 = BufWriter::new(File::create(output_v4)?);
+    let mut v6 = BufWriter::new(File::create(output_v6)?);
+
+    // IPv4 ranges are written as a pair of integers.
+    v4.write_all(PROLOGUE.as_bytes())?;
+    v4.write_all(hdr.as_bytes())?;
+    for (range, cc) in v4map.iter() {
+        let (lo, hi): (u32, u32) = (*range.start(), *range.end());
+        writeln!(v4, "{},{},{}", lo, hi, std::str::from_utf8(cc).unwrap())?;
+    }
+
+    // IPv6 ranges are written as a pair of textual addresses.
+    v6.write_all(PROLOGUE.as_bytes())?;
+    v6.write_all(hdr.as_bytes())?;
+    for (range, cc) in v6map.iter() {
+        let lo = Ipv6Addr::from(*range.start());
+        let hi = Ipv6Addr::from(*range.end());
+        writeln!(v6, "{},{},{}", lo, hi, std::str::from_utf8(cc).unwrap())?;
+    }
+
+    // Flush explicitly so that write errors are reported rather than lost
+    // when the BufWriters are dropped.
+    v4.flush()?;
+    v6.flush()?;
+
+    Ok(())
+}
+
+/// Parse the command line and run the conversion.
+fn main() -> std::io::Result<()> {
+    let Args {
+        input,
+        output_ipv4,
+        output_ipv6,
+    } = argh::from_env();
+
+    convert(&input, &output_ipv4, &output_ipv6)
+}
diff --git a/scripts/maint/geoip/update_geoip.sh b/scripts/maint/geoip/update_geoip.sh
new file mode 100755
index 0000000000..9289e7a969
--- /dev/null
+++ b/scripts/maint/geoip/update_geoip.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+set -e
+
+DIR=$(cd "$(dirname "$0")" && pwd)
+TMP=$(mktemp -d)
+
+location update
+location dump "$TMP/geoip-dump.txt"
+
+OLDDIR=$(pwd)
+cd "$DIR/geoip-db-tool/"
+cargo build --release
+cd "$OLDDIR"
+
+"$DIR/geoip-db-tool/target/release/geoip-db-tool" -i "$TMP/geoip-dump.txt"
author	Nick Mathewson <nickm@torproject.org>	2021-02-22 08:30:11 -0500
committer	Nick Mathewson <nickm@torproject.org>	2021-02-22 12:25:18 -0500
commit	0d4237839b21b466526a01147538d09c117cc884 (patch)
tree	f13c7c70aed9ae3645c0eb171348f035e468ad8b /scripts
parent	8ccfd4a51ad55e9834cffcc91cbaa13e1f19c8ff (diff)
download	tor-0d4237839b21b466526a01147538d09c117cc884.tar.gz tor-0d4237839b21b466526a01147538d09c117cc884.zip