scripts/maint/geoip/geoip-db-tool/src/main.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

/// A basic tool to convert IPFire Location dumps into the CSV formats that Tor
/// expects.
mod db;

use argh::FromArgs;
use ipnetwork::IpNetwork;
use rangemap::RangeInclusiveMap;

use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::net::{IpAddr, Ipv6Addr};
use std::path::{Path, PathBuf};

fn default_ipv4_path() -> PathBuf {
    "./geoip".into()
}
fn default_ipv6_path() -> PathBuf {
    "./geoip6".into()
}

#[derive(FromArgs)]
/// Convert an IPFire Location dump into CSV geoip files.
struct Args {
    /// where to store the IPv4 geoip output
    #[argh(option, default = "default_ipv4_path()", short = '4')]
    output_ipv4: PathBuf,

    /// where to store the IPv6 geoip6 output
    #[argh(option, default = "default_ipv6_path()", short = '6')]
    output_ipv6: PathBuf,

    /// where to find the dump file
    #[argh(option, short = 'i')]
    input: PathBuf,
}

/// Represents a network block from running `location dump`.
#[derive(Debug, Clone)]
pub struct NetBlock {
    pub net: IpNetwork,
    pub cc: [u8; 2],
    pub is_anon_proxy: bool,
    pub is_anycast: bool,
    pub is_satellite: bool,
}

impl PartialEq for NetBlock {
    fn eq(&self, other: &Self) -> bool {
        self.net == other.net
    }
}

/// We define network blocks as being sorted first from largest to smallest,
/// then by address.
impl Ord for NetBlock {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.net
            .prefix()
            .cmp(&other.net.prefix())
            .then_with(|| self.net.network().cmp(&other.net.network()))
    }
}

impl PartialOrd for NetBlock {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Eq for NetBlock {}

const PROLOGUE: &str = "\
# This file has been converted from the IPFire Location database
# using Tor's geoip-db-tool.  For more information on the data, see
# https://location.ipfire.org/.
#
# Below is the header from the original export:
#
";

/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files.
///
/// This code tries to be "efficient enough"; most of the logic is handled by
/// using the rangemap crate.
fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> {
    let f = File::open(input)?;
    let f = BufReader::new(f);
    let mut blocks = Vec::new();

    let mut reader = db::BlockReader::new(f.lines());
    let hdr = reader.extract_header();
    // Read blocks, and then sort them by specificity and address.
    for nb in reader {
        blocks.push(nb);
    }
    blocks.sort();

    // Convert the sorted blocks into a map from address ranges into
    // country codes.
    //
    // Note that since we have sorted the blocks from least to most specific,
    // we will be puttting them into the maps in the right order, so that the
    // most specific rule "wins".
    //
    // We use u32 and u128 as the index types for these RangeInclusiveMaps,
    // so that we don't need to implement a step function for IpAddr.
    let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new();
    let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new();

    let mut n = 0usize;
    let num_blocks = blocks.len();
    for nb in blocks {
        n += 1;
        if n % 100000 == 0 {
            println!("{}/{}", n, num_blocks);
        }
        let start = nb.net.network();
        let end = nb.net.broadcast();
        match (start, end) {
            (IpAddr::V4(a), IpAddr::V4(b)) => {
                v4map.insert(a.into()..=b.into(), nb.cc);
            }
            (IpAddr::V6(a), IpAddr::V6(b)) => {
                v6map.insert(a.into()..=b.into(), nb.cc);
            }
            (_, _) => panic!("network started and ended in different families!?"),
        }
    }

    // Write the ranges out to the appropriate files, in order.
    let mut v4 = BufWriter::new(File::create(output_v4)?);
    let mut v6 = BufWriter::new(File::create(output_v6)?);

    v4.write_all(PROLOGUE.as_bytes())?;
    v4.write_all(hdr.as_bytes())?;
    for (r, cc) in v4map.iter() {
        let a: u32 = *r.start();
        let b: u32 = *r.end();
        writeln!(&mut v4, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
    }

    v6.write_all(PROLOGUE.as_bytes())?;
    v6.write_all(hdr.as_bytes())?;
    for (r, cc) in v6map.iter() {
        let a: Ipv6Addr = (*r.start()).into();
        let b: Ipv6Addr = (*r.end()).into();
        writeln!(&mut v6, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?;
    }

    // The documentation says you should always flush a BufWriter.
    v4.flush()?;
    v6.flush()?;

    Ok(())
}

fn main() -> std::io::Result<()> {
    let args: Args = argh::from_env();

    convert(
        args.input.as_path(),
        args.output_ipv4.as_path(),
        args.output_ipv6.as_path(),
    )
}