Country Generator 1.0.1
Procedural country generation for C++23
Loading...
Searching...
No Matches
countrygen.hpp
Go to the documentation of this file.
1#ifndef DASMIG_COUNTRYGEN_HPP
2#define DASMIG_COUNTRYGEN_HPP
3
#include "random.hpp"
#include <algorithm>
#include <array>
#include <charconv>
#include <cstddef>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iterator>
#include <ostream>
#include <random>
#include <ranges>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
20
/// @file countrygen.hpp
/// @brief Country generator library — procedural country generation for C++23.
/// @author Diego Dasso Migotto (diegomigotto at hotmail dot com)
/// @see doc/usage.md for the narrative tutorial.
25
26namespace dasmig
27{
28
// NOTE(review): the DASMIG_DATASET_DEFINED guard presumably lets several
// dasmig headers provide this same enum without a redefinition error —
// confirm against the sibling libraries.
#ifndef DASMIG_DATASET_DEFINED
#define DASMIG_DATASET_DEFINED
/// @brief Dataset size tier for resource loading.
///
/// Stored in a single byte (`std::uint8_t` underlying type).
enum class dataset : std::uint8_t
{
    lite, ///< ~195 sovereign states only.
    full  ///< ~250 countries and territories.
};
#endif
38
/// @brief Return type for country generation, holding all data fields.
///
/// Every data field is a public member; numeric and boolean fields are
/// zero/false-initialized, string fields start empty.
///
/// Supports implicit conversion to std::string (returns the common name)
/// and streaming via operator<<.
class country
{
  public:
    // -- Identification ---------------------------------------------------

    /// @brief ISO 3166-1 alpha-2 code (e.g. "BR").
    std::string cca2;

    /// @brief ISO 3166-1 alpha-3 code (e.g. "BRA").
    std::string cca3;

    /// @brief ISO 3166-1 numeric code (e.g. "076").
    std::string ccn3;

    /// @brief Common English name (e.g. "Brazil").
    std::string name_common;

    /// @brief Official English name (e.g. "Federative Republic of Brazil").
    std::string name_official;

    /// @brief Capital city (semicolon-separated if multiple).
    std::string capital;

    // -- Geography --------------------------------------------------------

    /// @brief UN geoscheme region (e.g. "Americas").
    std::string region;

    /// @brief UN geoscheme subregion (e.g. "South America").
    std::string subregion;

    /// @brief Continent (semicolon-separated if multiple).
    std::string continent;

    /// @brief WGS84 latitude in decimal degrees.
    double latitude{0.0};

    /// @brief WGS84 longitude in decimal degrees.
    double longitude{0.0};

    /// @brief Total land area in km².
    std::uint64_t area{0};

    /// @brief Total population.
    std::uint64_t population{0};

    /// @brief Whether the country is landlocked.
    bool landlocked{false};

    // -- Political --------------------------------------------------------

    /// @brief Whether the country is an independent sovereign state.
    bool independent{false};

    /// @brief Whether the country is a UN member.
    bool un_member{false};

    // -- Culture ----------------------------------------------------------

    /// @brief Semicolon-separated language names (e.g. "Portuguese").
    std::string languages;

    /// @brief Primary ISO 4217 currency code (e.g. "BRL").
    std::string currency_code;

    /// @brief Primary currency name (e.g. "Brazilian real").
    std::string currency_name;

    /// @brief Primary currency symbol (e.g. "R$").
    std::string currency_symbol;

    // -- Borders & time ---------------------------------------------------

    /// @brief Semicolon-separated ISO alpha-3 border codes.
    std::string borders;

    /// @brief Semicolon-separated UTC offset strings.
    std::string timezones;

    /// @brief Driving side: "right" or "left".
    std::string driving_side;

    // -- Telecom & web ----------------------------------------------------

    /// @brief Country-code top-level domain (e.g. ".br").
    std::string tld;

    /// @brief IDD root (e.g. "+5").
    std::string idd_root;

    /// @brief IDD primary suffix (e.g. "5").
    std::string idd_suffix;

    // -- Demonyms & misc --------------------------------------------------

    /// @brief English male demonym (e.g. "Brazilian").
    std::string demonym_m;

    /// @brief English female demonym (e.g. "Brazilian").
    std::string demonym_f;

    /// @brief Unicode flag emoji.
    std::string flag_emoji;

    /// @brief World Bank income level (e.g. "Upper middle income").
    std::string income_level;

    /// @brief Start of the week: "monday", "sunday", or "saturday".
    std::string start_of_week;

    // -- Methods ----------------------------------------------------------

    /// @brief Retrieve the random seed used to generate this country.
    /// @return The per-call seed for replay; 0 on a default-constructed
    ///         object.
    /// @see cntg::get_country(std::uint64_t)
    [[nodiscard]] std::uint64_t seed() const
    {
        return _seed;
    }

    /// @brief Implicit conversion to std::string.
    /// @return A copy of the common English name.
    operator std::string() const // NOLINT(hicpp-explicit-conversions)
    {
        return name_common;
    }

    /// @brief Stream the common name to an output stream.
    /// @param os Destination stream.
    /// @param c  Country whose common name is written.
    /// @return `os`, to allow chaining.
    friend std::ostream& operator<<(std::ostream& os, const country& c)
    {
        os << c.name_common;
        return os;
    }

  private:
    // Seed recorded by the generator; only cntg (a friend) writes it.
    std::uint64_t _seed{0};

    friend class cntg;
};
182
183/// @brief Country generator that produces random countries from
184/// aggregated open-data sources.
185///
186/// Generates countries using population-weighted random selection by
187/// default. Larger countries are proportionally more likely to be
188/// selected, mirroring real-world demographic distributions.
189///
190/// Can be used as a singleton via instance() or constructed independently.
191///
192/// @par Thread safety
193/// Each instance is independent. Concurrent calls to get_country() on
194/// the **same** instance require external synchronization. load() mutates
195/// internal state and must not be called concurrently with get_country()
196/// on the same instance.
197class cntg
198{
199 public:
200 /// @brief Default constructor — creates an empty generator with no data.
201 cntg() = default;
202
203 cntg(const cntg&) = delete; ///< Not copyable.
204 cntg& operator=(const cntg&) = delete; ///< Not copyable.
205 cntg(cntg&&) noexcept = default; ///< Move constructor.
206 cntg& operator=(cntg&&) noexcept = default; ///< Move assignment.
207 ~cntg() = default; ///< Default destructor.
208
209 /// @brief Access the global singleton instance.
210 ///
211 /// The singleton auto-probes common resource paths on first access.
212 /// @return Reference to the global cntg instance.
213 static cntg& instance()
214 {
215 static cntg inst{auto_probe_tag{}};
216 return inst;
217 }
218
219 // -- Generation -------------------------------------------------------
220
221 /// @brief Generate a random country.
222 ///
223 /// By default, selection is population-weighted. Call weighted(false)
224 /// to switch to uniform random selection.
225 /// @return A country object with all fields.
226 /// @throws std::runtime_error If no data has been loaded.
227 [[nodiscard]] country get_country()
228 {
229 auto call_seed = static_cast<std::uint64_t>(_engine());
230 return get_country(call_seed);
231 }
232
233 /// @brief Generate a deterministic country using a specific seed.
234 /// @param call_seed Seed for reproducible results.
235 /// @return A country object with all fields.
236 /// @throws std::runtime_error If no data has been loaded.
237 [[nodiscard]] country get_country(std::uint64_t call_seed) const
238 {
239 if (_countries.empty())
240 {
241 throw std::runtime_error(
242 "No country data loaded. Call load() first.");
243 }
244
245 effolkronium::random_local call_engine;
246 call_engine.seed(static_cast<std::mt19937::result_type>(
247 (call_seed ^ (call_seed >> seed_shift_))));
248
249 auto idx = _weighted
250 ? _distribution(call_engine.engine())
251 : _uniform(call_engine.engine());
252 country result = _countries[idx]; // NOLINT(cppcoreguidelines-pro-bounds-*)
253 result._seed = call_seed;
254 return result;
255 }
256
257 /// @brief Generate a random country filtered by region.
258 /// @param rgn UN geoscheme region (e.g. "Europe", "Asia").
259 /// @return A country object from the specified region.
260 /// @throws std::runtime_error If no data has been loaded.
261 /// @throws std::invalid_argument If no countries match the region.
262 [[nodiscard]] country get_country(const std::string& rgn)
263 {
264 auto call_seed = static_cast<std::uint64_t>(_engine());
265 return get_country(rgn, call_seed);
266 }
267
268 /// @brief Generate a deterministic country filtered by region.
269 /// @param rgn UN geoscheme region.
270 /// @param call_seed Seed for reproducible results.
271 /// @return A country object from the specified region.
272 /// @throws std::runtime_error If no data has been loaded.
273 /// @throws std::invalid_argument If no countries match the region.
274 [[nodiscard]] country get_country(const std::string& rgn,
275 std::uint64_t call_seed) const
276 {
277 if (_countries.empty())
278 {
279 throw std::runtime_error(
280 "No country data loaded. Call load() first.");
281 }
282
283 auto it = _region_index.find(rgn);
284 if (it == _region_index.end())
285 {
286 throw std::invalid_argument(
287 "No countries found for region: " + rgn);
288 }
289
290 effolkronium::random_local call_engine;
291 call_engine.seed(static_cast<std::mt19937::result_type>(
292 (call_seed ^ (call_seed >> seed_shift_))));
293
294 const auto& idx = it->second;
295 auto selected = _weighted
296 ? idx.distribution(call_engine.engine())
297 : std::uniform_int_distribution<std::size_t>(
298 0, idx.country_indices.size() - 1)(
299 call_engine.engine());
300 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-*)
301 country result = _countries[idx.country_indices[selected]];
302 result._seed = call_seed;
303 return result;
304 }
305
306 /// @brief Look up a country by its ISO 3166-1 alpha-2 code.
307 /// @param cca2 Two-letter country code (e.g. "US", "BR").
308 /// @return The matching country object.
309 /// @throws std::runtime_error If no data has been loaded.
310 /// @throws std::invalid_argument If no country matches the code.
311 [[nodiscard]] country find_country(const std::string& cca2) const
312 {
313 if (_countries.empty())
314 {
315 throw std::runtime_error(
316 "No country data loaded. Call load() first.");
317 }
318
319 auto it = _cca2_index.find(cca2);
320 if (it == _cca2_index.end())
321 {
322 throw std::invalid_argument(
323 "No country found for CCA2 code: " + cca2);
324 }
325
326 return _countries[it->second]; // NOLINT(cppcoreguidelines-pro-bounds-*)
327 }
328
329 // -- Seeding ----------------------------------------------------------
330
331 /// @name Seeding
332 /// @{
333
334 /// @brief Seed the internal random engine for deterministic sequences.
335 /// @param seed_value The seed value.
336 /// @return `*this` for chaining.
337 cntg& seed(std::uint64_t seed_value)
338 {
339 _engine.seed(seed_value);
340 return *this;
341 }
342
343 /// @brief Reseed the engine with a non-deterministic source.
344 /// @return `*this` for chaining.
346 {
347 _engine.seed(std::random_device{}());
348 return *this;
349 }
350
351 /// @}
352
353 /// @brief Set whether generation is population-weighted or uniform.
354 /// @param enable `true` for weighted (default), `false` for uniform.
355 /// @return `*this` for chaining.
356 cntg& weighted(bool enable)
357 {
358 _weighted = enable;
359 return *this;
360 }
361
362 /// @brief Query whether generation is population-weighted.
363 /// @return `true` if weighted (default), `false` if uniform.
364 [[nodiscard]] bool weighted() const
365 {
366 return _weighted;
367 }
368
369 // -- Data management --------------------------------------------------
370
371 /// @brief Check whether any data has been loaded.
372 [[nodiscard]] bool has_data() const
373 {
374 return !_countries.empty();
375 }
376
377 /// @brief Return the number of loaded countries.
378 [[nodiscard]] std::size_t country_count() const
379 {
380 return _countries.size();
381 }
382
383 /// @brief Load country data from a TSV file.
384 ///
385 /// Expects a tab-delimited file with a header row and 31 columns
386 /// matching the schema produced by scripts/prepare_countries.py.
387 /// Safe to call multiple times.
388 ///
389 /// @param tsv_path Path to the TSV file.
390 void load(const std::filesystem::path& tsv_path)
391 {
392 if (!std::filesystem::exists(tsv_path) ||
393 !std::filesystem::is_regular_file(tsv_path))
394 {
395 return;
396 }
397
398 std::ifstream file{tsv_path};
399 if (!file.is_open())
400 {
401 return;
402 }
403
404 std::string line;
405
406 // Skip header row.
407 if (!std::getline(file, line))
408 {
409 return;
410 }
411
412 while (std::getline(file, line))
413 {
414 if (line.empty())
415 {
416 continue;
417 }
418
419 if (line.back() == '\r')
420 {
421 line.pop_back();
422 }
423
424 auto c = parse_line(line);
425 if (!c.cca3.empty())
426 {
427 _countries.push_back(std::move(c));
428 }
429 }
430
431 rebuild_indices();
432 }
433
434 /// @brief Load a specific dataset tier from a base resources directory.
435 /// @param tier The dataset size to load.
436 /// @return `true` if a matching directory was found and loaded.
437 [[nodiscard]] bool load(dataset tier)
438 {
439 const char* subfolder =
440 (tier == dataset::full) ? "full" : "lite";
441
442 auto found = std::ranges::find_if(
443 probe_bases_, [&](const char* base) {
444 const auto tsv =
445 std::filesystem::path{base} / subfolder / "countries.tsv";
446 return std::filesystem::is_regular_file(tsv);
447 });
448 if (found != probe_bases_.end())
449 {
450 load(std::filesystem::path{*found} / subfolder / "countries.tsv");
451 return true;
452 }
453 return false;
454 }
455
456 private:
457 // All loaded countries.
458 std::vector<country> _countries;
459
460 // Population-weighted distribution over _countries indices.
461 mutable std::discrete_distribution<std::size_t> _distribution;
462
463 // Uniform distribution over _countries indices.
464 mutable std::uniform_int_distribution<std::size_t> _uniform;
465
466 // Whether to use population-weighted (true) or uniform (false) selection.
467 bool _weighted{true};
468
469 // Pre-built per-region index for O(1) region-filtered lookups.
470 struct region_entry
471 {
472 std::vector<std::size_t> country_indices;
473 mutable std::discrete_distribution<std::size_t> distribution;
474 };
475
476 std::unordered_map<std::string, region_entry> _region_index;
477
478 // Pre-built CCA2 → index map for O(1) code-based lookups.
479 std::unordered_map<std::string, std::size_t> _cca2_index;
480
481 // Bit shift for mixing per-call seeds.
482 static constexpr unsigned seed_shift_{32U};
483
484 // Common base paths probed for resource directories.
485 static constexpr std::array probe_bases_{
486 "resources", "../resources", "country-generator/resources"};
487
488 // Per-instance random engine for seed drawing.
489 std::mt19937_64 _engine{std::random_device{}()};
490
491 // Tag type for the auto-probing singleton constructor.
492 struct auto_probe_tag {};
493
494 // Singleton constructor: auto-probes common resource locations.
495 explicit cntg(auto_probe_tag /*tag*/)
496 {
497 auto found = std::ranges::find_if(
498 probe_bases_, [](const char* p) {
499 return std::filesystem::exists(p) &&
500 std::filesystem::is_directory(p);
501 });
502 if (found != probe_bases_.end())
503 {
504 const std::filesystem::path base{*found};
505 auto lite_tsv = base / "lite" / "countries.tsv";
506 auto full_tsv = base / "full" / "countries.tsv";
507 if (std::filesystem::is_regular_file(lite_tsv))
508 {
509 load(lite_tsv);
510 }
511 else if (std::filesystem::is_regular_file(full_tsv))
512 {
513 load(full_tsv);
514 }
515 }
516 }
517
518 // Rebuild the global distribution and region index.
519 void rebuild_indices()
520 {
521 // Global distribution.
522 std::vector<double> weights;
523 weights.reserve(_countries.size());
524
525 std::ranges::transform(
526 _countries, std::back_inserter(weights), [](const country& c) {
527 return static_cast<double>(
528 std::max<std::uint64_t>(c.population, 1));
529 });
530
531 _distribution = std::discrete_distribution<std::size_t>(
532 weights.begin(), weights.end());
533
534 // Uniform distribution.
535 if (!_countries.empty())
536 {
537 _uniform = std::uniform_int_distribution<std::size_t>(
538 0, _countries.size() - 1);
539 }
540
541 // Region index.
542 _region_index.clear();
543 _cca2_index.clear();
544
545 for (auto&& [i, c] : _countries | std::views::enumerate)
546 {
547 auto idx = static_cast<std::size_t>(i);
548 auto& entry = _region_index[c.region];
549 entry.country_indices.push_back(idx);
550 _cca2_index[c.cca2] = idx;
551 }
552
553 // Build per-region distributions.
554 for (auto& [rgn, entry] : _region_index)
555 {
556 std::vector<double> rw;
557 rw.reserve(entry.country_indices.size());
558
559 for (auto idx : entry.country_indices)
560 {
561 rw.push_back(static_cast<double>(
562 std::max<std::uint64_t>(
563 _countries[idx].population, 1))); // NOLINT(cppcoreguidelines-pro-bounds-*)
564 }
565
566 entry.distribution = std::discrete_distribution<std::size_t>(
567 rw.begin(), rw.end());
568 }
569 }
570
571 // Parse a single tab-delimited line into a country object.
572 // Expected 31 fields matching the TSV header.
573 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
574 static country parse_line(const std::string& line)
575 {
576 country c;
577
578 static constexpr std::size_t num_fields{31};
579 std::vector<std::string> fields;
580 fields.reserve(num_fields);
581
582 for (auto part : line | std::views::split('\t'))
583 {
584 fields.emplace_back(std::ranges::begin(part),
585 std::ranges::end(part));
586 }
587
588 if (fields.size() < num_fields)
589 {
590 return c; // cca3 stays empty, skipped by caller.
591 }
592
593 // Locale-independent numeric parsing via std::from_chars.
594 // NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
595 auto parse_double = [](const std::string& s, double fallback = 0.0) {
596 if (s.empty())
597 {
598 return fallback;
599 }
600 double val{};
601 auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), val);
602 return ec == std::errc{} ? val : fallback;
603 };
604
605 auto parse_uint64 = [](const std::string& s) -> std::uint64_t {
606 if (s.empty())
607 {
608 return 0ULL;
609 }
610 std::uint64_t val{};
611 auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), val);
612 return ec == std::errc{} ? val : 0ULL;
613 };
614 // NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic)
615
616 c.cca2 = std::move(fields[0]);
617 c.cca3 = std::move(fields[1]);
618 c.ccn3 = std::move(fields[2]);
619 c.name_common = std::move(fields[3]);
620 c.name_official = std::move(fields[4]);
621 c.capital = std::move(fields[5]);
622 c.region = std::move(fields[6]);
623 c.subregion = std::move(fields[7]);
624 c.continent = std::move(fields[8]);
625 c.latitude = parse_double(fields[9]);
626 c.longitude = parse_double(fields[10]);
627 c.area = parse_uint64(fields[11]);
628 c.population = parse_uint64(fields[12]);
629 c.landlocked = (fields[13] == "1");
630 c.independent = (fields[14] == "1");
631 c.un_member = (fields[15] == "1");
632 c.languages = std::move(fields[16]);
633 c.currency_code = std::move(fields[17]);
634 c.currency_name = std::move(fields[18]);
635 c.currency_symbol = std::move(fields[19]);
636 c.borders = std::move(fields[20]);
637 c.timezones = std::move(fields[21]);
638 c.driving_side = std::move(fields[22]);
639 c.tld = std::move(fields[23]);
640 c.idd_root = std::move(fields[24]);
641 c.idd_suffix = std::move(fields[25]);
642 c.demonym_m = std::move(fields[26]);
643 c.demonym_f = std::move(fields[27]);
644 c.flag_emoji = std::move(fields[28]);
645 c.income_level = std::move(fields[29]);
646 c.start_of_week = std::move(fields[30]);
647
648 return c;
649 }
650};
651
652} // namespace dasmig
653
654#endif // DASMIG_COUNTRYGEN_HPP
Country generator that produces random countries from aggregated open-data sources.
cntg & seed(std::uint64_t seed_value)
Seed the internal random engine for deterministic sequences.
bool load(dataset tier)
Load a specific dataset tier from a base resources directory.
cntg(cntg &&) noexcept=default
Move constructor.
country get_country(const std::string &rgn, std::uint64_t call_seed) const
Generate a deterministic country filtered by region.
country get_country()
Generate a random country.
cntg(const cntg &)=delete
Not copyable.
country get_country(const std::string &rgn)
Generate a random country filtered by region.
cntg & operator=(const cntg &)=delete
Not copyable.
static cntg & instance()
Access the global singleton instance.
country get_country(std::uint64_t call_seed) const
Generate a deterministic country using a specific seed.
cntg & unseed()
Reseed the engine with a non-deterministic source.
country find_country(const std::string &cca2) const
Look up a country by its ISO 3166-1 alpha-2 code.
cntg & weighted(bool enable)
Set whether generation is population-weighted or uniform.
cntg()=default
Default constructor — creates an empty generator with no data.
bool weighted() const
Query whether generation is population-weighted.
void load(const std::filesystem::path &tsv_path)
Load country data from a TSV file.
std::size_t country_count() const
Return the number of loaded countries.
bool has_data() const
Check whether any data has been loaded.
Return type for country generation, holding all data fields.
friend std::ostream & operator<<(std::ostream &os, const country &c)
Stream the common name to an output stream.
std::uint64_t seed() const
Retrieve the random seed used to generate this country.
std::string cca3
ISO 3166-1 alpha-3 code (e.g. "BRA").
std::string ccn3
ISO 3166-1 numeric code (e.g. "076").
double longitude
WGS84 longitude in decimal degrees.
std::string idd_suffix
IDD primary suffix (e.g. "5").
std::string timezones
Semicolon-separated UTC offset strings.
std::string demonym_f
English female demonym (e.g. "Brazilian").
std::uint64_t area
Total land area in km².
std::string flag_emoji
Unicode flag emoji.
std::string name_official
Official English name (e.g. "Federative Republic of Brazil").
std::string region
UN geoscheme region (e.g. "Americas").
std::string continent
Continent (semicolon-separated if multiple).
std::string capital
Capital city (semicolon-separated if multiple).
std::string cca2
ISO 3166-1 alpha-2 code (e.g. "BR").
std::string start_of_week
Start of the week: "monday", "sunday", or "saturday".
bool un_member
Whether the country is a UN member.
std::string borders
Semicolon-separated ISO alpha-3 border codes.
std::string name_common
Common English name (e.g. "Brazil").
bool independent
Whether the country is an independent sovereign state.
std::string income_level
World Bank income level (e.g. "Upper middle income").
double latitude
WGS84 latitude in decimal degrees.
std::string driving_side
Driving side: "right" or "left".
std::string currency_symbol
Primary currency symbol (e.g. "R$").
std::string demonym_m
English male demonym (e.g. "Brazilian").
std::string tld
Country-code top-level domain (e.g. ".br").
std::string languages
Semicolon-separated language names (e.g. "Portuguese").
std::string idd_root
IDD root (e.g. "+5").
std::uint64_t population
Total population.
bool landlocked
Whether the country is landlocked.
std::string subregion
UN geoscheme subregion (e.g. "South America").
std::string currency_code
Primary ISO 4217 currency code (e.g. "BRL").
std::string currency_name
Primary currency name (e.g. "Brazilian real").
@ full
~250 countries and territories.
@ lite
~195 sovereign states only.