summaryrefslogtreecommitdiff
path: root/scripts/maint/geoip/geoip-db-tool/src/main.rs
blob: 9a22598a354c2a5ea5ee873c7fc842a31a5803be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
//! A basic tool to convert IPFire Location dumps into the CSV formats that Tor
//! expects.
mod db;

use argh::FromArgs;
use ipnetwork::IpNetwork;
use rangemap::RangeInclusiveMap;

use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::net::{IpAddr, Ipv6Addr};
use std::num::NonZeroU32;
use std::path::PathBuf;

/// Default location for the IPv4 output file: `./geoip`.
fn default_ipv4_path() -> PathBuf {
    PathBuf::from("./geoip")
}
/// Default location for the IPv6 output file: `./geoip6`.
fn default_ipv6_path() -> PathBuf {
    PathBuf::from("./geoip6")
}

// Command-line options for the tool, parsed by `argh` via `#[derive(FromArgs)]`.
//
// NOTE: the `///` doc comments on the struct and its fields are read by the
// derive macro (argh uses them as the help descriptions), so they are part of
// the program's user-facing output.  Explanatory notes are therefore written
// as plain `//` comments here.
#[derive(FromArgs)]
/// Convert an IPFire Location dump into CSV geoip files.
struct Args {
    /// where to store the IPv4 geoip output
    // Short flag `-4`; falls back to `default_ipv4_path()` when not given.
    #[argh(option, default = "default_ipv4_path()", short = '4')]
    output_ipv4: PathBuf,

    /// where to store the IPv6 geoip6 output
    // Short flag `-6`; falls back to `default_ipv6_path()` when not given.
    #[argh(option, default = "default_ipv6_path()", short = '6')]
    output_ipv6: PathBuf,

    /// where to find the dump file
    // Short flag `-i`; no default is declared for this option.
    #[argh(option, short = 'i')]
    input: PathBuf,

    /// whether to include AS information in our output
    // When set, `convert` appends an AS-number column to every CSV row.
    #[argh(switch)]
    include_asn: bool,

    /// where to store the AS map.
    // When present, `convert` writes one `asn,name` line per AS definition to
    // this path; when absent, no AS map is written.
    #[argh(option)]
    output_asn: Option<PathBuf>,
}

/// Represents a network block from running `location dump`.
///
/// Equality and ordering of `NetBlock`s look only at `net`; the remaining
/// fields are payload.
#[derive(Debug, Clone)]
pub struct NetBlock {
    /// The network (address and prefix length) this block describes.
    pub net: IpNetwork,
    /// Two-byte country code; `NetDefn::cc` later interprets it as UTF-8 text.
    pub cc: [u8; 2],
    /// AS number associated with the network, if any.
    pub asn: Option<NonZeroU32>,
    /// Not read anywhere in this file.
    // NOTE(review): presumably mirrors an "anonymous proxy" flag in the dump —
    // confirm against the parser in `db`.
    pub is_anon_proxy: bool,
    /// Not read anywhere in this file.
    // NOTE(review): presumably mirrors an "anycast" flag in the dump — confirm
    // against the parser in `db`.
    pub is_anycast: bool,
    /// Not read anywhere in this file.
    // NOTE(review): presumably mirrors a "satellite provider" flag in the
    // dump — confirm against the parser in `db`.
    pub is_satellite: bool,
}

/// Represents an AS definition from running `location dump`.
///
/// The derived ordering compares fields in declaration order: first `asn`,
/// then `name`.  `convert` relies on it to sort the AS map before writing it.
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub struct AsBlock {
    /// The AS number (never zero).
    pub asn: NonZeroU32,
    /// The name recorded for this AS; written verbatim into the AS map.
    pub name: String,
}

/// Two blocks are equal exactly when [`Ord::cmp`] considers them equal, i.e.
/// when they have the same prefix length and the same network address.
///
/// Equality is defined through `cmp` rather than by comparing `net` directly:
/// `IpNetwork` equality looks at the address as written (including any host
/// bits), whereas the ordering looks at the masked network address.  Comparing
/// `net` directly would let `a.cmp(&b) == Equal` while `a != b`, which breaks
/// the consistency that `Ord` requires between `==` and `cmp`.
impl PartialEq for NetBlock {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == std::cmp::Ordering::Equal
    }
}

/// Network blocks sort by prefix length first, so that larger networks
/// (shorter prefixes) come before smaller ones; blocks with the same prefix
/// length are then ordered by their network address.
impl Ord for NetBlock {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Tuples compare lexicographically: prefix first, then address.
        let lhs = (self.net.prefix(), self.net.network());
        let rhs = (other.net.prefix(), other.net.network());
        lhs.cmp(&rhs)
    }
}

/// Partial ordering for `NetBlock`: always `Some`, delegating to the total
/// order given by the `Ord` implementation.
impl PartialOrd for NetBlock {
    fn partial_cmp(&self, rhs: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, rhs))
    }
}

// `Eq` has no methods of its own; this marker impl is needed because `Ord`
// (implemented for `NetBlock` in this file) has `Eq` as a supertrait.
impl Eq for NetBlock {}

/// The data attached to each address range in the output: a country code and
/// an optional AS number.
///
/// This is the value type stored in the range maps built by `convert`; it is
/// produced from a `NetBlock` by `NetBlock::into_defn`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
struct NetDefn {
    /// Two-byte country code, copied from the originating `NetBlock`.
    cc: [u8; 2],
    /// AS number, or `None` when it is unknown or AS output was not requested.
    asn: Option<NonZeroU32>,
}

impl NetBlock {
    /// Consume this block and keep only the data that goes into the output.
    ///
    /// The country code is always carried over.  The AS number is carried
    /// over only when `include_asn` is true; otherwise it is dropped.
    fn into_defn(self, include_asn: bool) -> NetDefn {
        let asn = if include_asn { self.asn } else { None };
        NetDefn { cc: self.cc, asn }
    }
}

impl NetDefn {
    fn cc(&self) -> &str {
        std::str::from_utf8(&self.cc).unwrap()
    }
    fn asn(&self) -> u32 {
        match self.asn {
            Some(v) => v.into(),
            None => 0,
        }
    }
}

/// Comment block written at the very top of both CSV output files, directly
/// before the header that was extracted from the input dump.
const PROLOGUE: &str = "\
# This file has been converted from the IPFire Location database
# using Tor's geoip-db-tool.  For more information on the data, see
# https://location.ipfire.org/.
#
# Below is the header from the original export:
#
";

/// Convert a `location dump` file into CSV geoip files for IPv4 and IPv6.
///
/// Reads `args.input`, then writes the IPv4 ranges to `args.output_ipv4` and
/// the IPv6 ranges to `args.output_ipv6`.  When `args.output_asn` is set, the
/// AS definitions found in the dump are also written there, sorted, as
/// `asn,name` lines.
///
/// This aims to be "efficient enough" rather than fast; most of the actual
/// work is left to the rangemap crate.
///
/// # Errors
///
/// Returns an error if the input cannot be opened, or if creating, writing,
/// or flushing any of the output files fails.
///
/// # Panics
///
/// Panics if the first and last address of a network belong to different
/// address families, or if a country code is not valid UTF-8 (see
/// `NetDefn::cc`).
fn convert(args: Args) -> std::io::Result<()> {
    let include_asn = args.include_asn;
    let input = args.input.as_path();
    let output_v4 = args.output_ipv4.as_path();
    let output_v6 = args.output_ipv6.as_path();

    let lines = BufReader::new(File::open(input)?).lines();
    let mut net_blocks = Vec::new();
    let mut as_blocks = Vec::new();

    let mut reader = db::BlockReader::new(lines);
    let hdr = reader.extract_header();

    // Collect network blocks and AS definitions; anything else is ignored.
    for entry in reader {
        match entry {
            db::AnyBlock::NetBlock(nb) => net_blocks.push(nb),
            db::AnyBlock::AsBlock(ab) => as_blocks.push(ab),
            _ => {}
        }
    }

    // Order the network blocks by specificity, then by address.
    net_blocks.sort();

    // Build maps from address ranges to their definitions.
    //
    // Because the blocks are now ordered from least to most specific, they
    // are inserted in the right order for the most specific rule to "win".
    //
    // The maps are keyed on u32 and u128 rather than on IpAddr, so that no
    // step function has to be implemented for IpAddr.
    let mut v4map: RangeInclusiveMap<u32, NetDefn, _> = RangeInclusiveMap::new();
    let mut v6map: RangeInclusiveMap<u128, NetDefn, _> = RangeInclusiveMap::new();

    let total = net_blocks.len();
    for (idx, nb) in net_blocks.into_iter().enumerate() {
        // Report progress once per 100000 processed blocks.
        let done = idx + 1;
        if done % 100000 == 0 {
            println!("{}/{}", done, total);
        }

        let first = nb.net.network();
        let last = nb.net.broadcast();
        match (first, last) {
            (IpAddr::V4(lo), IpAddr::V4(hi)) => {
                v4map.insert(u32::from(lo)..=u32::from(hi), nb.into_defn(include_asn));
            }
            (IpAddr::V6(lo), IpAddr::V6(hi)) => {
                v6map.insert(u128::from(lo)..=u128::from(hi), nb.into_defn(include_asn));
            }
            _ => panic!("network started and ended in different families!?"),
        }
    }

    // Emit the ranges, in order, to the two output files.
    let mut v4 = BufWriter::new(File::create(output_v4)?);
    let mut v6 = BufWriter::new(File::create(output_v6)?);

    // IPv4 rows give the range bounds as plain integers.
    v4.write_all(PROLOGUE.as_bytes())?;
    v4.write_all(hdr.as_bytes())?;
    for (range, defn) in v4map.iter() {
        let (lo, hi): (u32, u32) = (*range.start(), *range.end());
        if include_asn {
            writeln!(v4, "{},{},{},{}", lo, hi, defn.cc(), defn.asn())?;
        } else {
            writeln!(v4, "{},{},{}", lo, hi, defn.cc())?;
        }
    }

    // IPv6 rows give the range bounds as textual addresses.
    v6.write_all(PROLOGUE.as_bytes())?;
    v6.write_all(hdr.as_bytes())?;
    for (range, defn) in v6map.iter() {
        let lo = Ipv6Addr::from(*range.start());
        let hi = Ipv6Addr::from(*range.end());
        if include_asn {
            writeln!(v6, "{},{},{},{}", lo, hi, defn.cc(), defn.asn())?;
        } else {
            writeln!(v6, "{},{},{}", lo, hi, defn.cc())?;
        }
    }

    // Flush explicitly so that write errors are reported instead of being
    // lost when the BufWriter is dropped.
    v4.flush()?;
    v6.flush()?;

    // Optionally write the sorted AS map.
    if let Some(asn_path) = args.output_asn {
        as_blocks.sort();
        let mut out = BufWriter::new(File::create(asn_path)?);
        for ab in as_blocks {
            writeln!(out, "{},{}", ab.asn, ab.name)?;
        }
        out.flush()?;
    }

    Ok(())
}

fn main() -> std::io::Result<()> {
    let args: Args = argh::from_env();

    convert(args)
}