City Generator 1.0.1
Procedural city generation for C++23
Loading...
Searching...
No Matches
citygen.hpp
Go to the documentation of this file.
1#ifndef DASMIG_CITYGEN_HPP
2#define DASMIG_CITYGEN_HPP
3
4#include "random.hpp"
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <cstdint>
9#include <filesystem>
10#include <fstream>
11#include <ostream>
12#include <random>
13#include <ranges>
14#include <stdexcept>
15#include <string>
16#include <unordered_map>
17#include <utility>
18#include <vector>
19
20/// @file citygen.hpp
21/// @brief City generator library — procedural city generation for C++23.
22/// @author Diego Dasso Migotto (diegomigotto at hotmail dot com)
23/// @see See doc/usage.md for the narrative tutorial.
24
25namespace dasmig
26{
27
/// @brief Selects which dataset size tier is loaded from the resources.
///
/// The definition is wrapped in the DASMIG_DATASET_DEFINED guard so that it
/// is only emitted once if the macro has already been defined elsewhere.
#ifndef DASMIG_DATASET_DEFINED
#define DASMIG_DATASET_DEFINED
enum class dataset : std::uint8_t
{
    /// ~25k major cities (population >= 15,000).
    lite,

    /// ~200k cities (population >= 500).
    full
};
#endif
37
/// @brief Value type returned by city generation; carries every GeoNames
///        column for one city.
///
/// A city converts implicitly to std::string and can be written to a
/// std::ostream; both yield the UTF-8 name.
class city
{
  public:
    /// @brief GeoNames primary key.
    std::uint32_t geonameid = 0;

    /// @brief City name, UTF-8 encoded.
    std::string name;

    /// @brief Name transliterated to plain ASCII.
    std::string asciiname;

    /// @brief Latitude in decimal degrees (WGS84).
    double latitude = 0.0;

    /// @brief Longitude in decimal degrees (WGS84).
    double longitude = 0.0;

    /// @brief GeoNames feature code (PPL, PPLA, PPLC, etc.).
    std::string feature_code;

    /// @brief Two-letter ISO-3166 country code.
    std::string country_code;

    /// @brief Comma-separated alternate country codes.
    std::string cc2;

    /// @brief Administrative division code, level 1 (state/province).
    std::string admin1_code;

    /// @brief Administrative division code, level 2 (county/district).
    std::string admin2_code;

    /// @brief Administrative division code, level 3 (township/commune).
    std::string admin3_code;

    /// @brief Administrative division code, level 4 (sub-district).
    std::string admin4_code;

    /// @brief Population of the city.
    std::uint64_t population = 0;

    /// @brief Elevation in meters; -9999 means unknown.
    std::int16_t elevation = -9999;

    /// @brief Digital elevation model value (more reliable than elevation).
    std::int16_t dem = 0;

    /// @brief IANA timezone identifier (e.g., "Europe/London").
    std::string timezone;

    /// @brief Seed that was used when this city was generated.
    /// @return The per-call seed, usable to replay the same pick.
    /// @see cg::get_city(std::uint64_t)
    [[nodiscard]] std::uint64_t seed() const
    {
        return _seed;
    }

    /// @brief Implicit conversion to std::string.
    /// @return A copy of the UTF-8 city name.
    operator std::string() const // NOLINT(hicpp-explicit-conversions)
    {
        return name;
    }

    /// @brief Write the city name to an output stream.
    /// @param os Destination stream.
    /// @param c  City whose name is written.
    /// @return The same stream, to allow chaining.
    friend std::ostream& operator<<(std::ostream& os, const city& c)
    {
        return os << c.name;
    }

  private:
    std::uint64_t _seed = 0; ///< Seed recorded by the generator.

    friend class cg; ///< cg writes _seed after selecting a city.
};
123
124/// @brief City generator that produces random cities from GeoNames data.
125///
126/// Generates cities using population-weighted random selection from
127/// GeoNames data. Larger cities are proportionally more likely to be
128/// selected, mirroring real-world population distributions.
129///
130/// Can be used as a singleton via instance() or constructed independently.
131/// Independent instances own their own data and random engine,
132/// making them safe for concurrent use without shared state.
133///
134/// @par Thread safety
135/// Each instance is independent. Concurrent calls to get_city() on
136/// the **same** instance require external synchronization. load() mutates
137/// internal state and must not be called concurrently with get_city()
138/// on the same instance.
139class cg
140{
141 public:
142 /// @brief Default constructor — creates an empty generator with no data.
143 ///
144 /// Call load() to populate city data before generating.
145 cg() = default;
146
147 cg(const cg&) = delete; ///< Not copyable.
148 cg& operator=(const cg&) = delete; ///< Not copyable.
149 cg(cg&&) = default; ///< Move constructor.
150 cg& operator=(cg&&) = default; ///< Move assignment.
151 ~cg() = default; ///< Default destructor.
152
153 /// @brief Access the global singleton instance.
154 ///
155 /// The singleton auto-probes common resource paths on first access.
156 /// For independent generators (e.g., inside an entity-generator
157 /// component), prefer constructing a separate cg instance.
158 /// @return Reference to the global cg instance.
159 static cg& instance()
160 {
161 static cg inst{auto_probe_tag{}};
162 return inst;
163 }
164
165 /// @brief Generate a random city.
166 ///
167 /// By default, selection is population-weighted. Call weighted(false)
168 /// to switch to uniform random selection.
169 /// @return A city object with all GeoNames fields.
170 /// @throws std::runtime_error If no city data has been loaded.
171 [[nodiscard]] city get_city()
172 {
173 auto call_seed = static_cast<std::uint64_t>(_engine());
174 return get_city(call_seed);
175 }
176
177 /// @brief Generate a deterministic city using a specific seed.
178 ///
179 /// Given the same loaded data and seed, this method always produces the
180 /// same city. Retrieve the seed from a previous city via city::seed().
181 ///
182 /// @param call_seed Seed for reproducible results.
183 /// @return A city object with all GeoNames fields.
184 /// @throws std::runtime_error If no city data has been loaded.
185 [[nodiscard]] city get_city(std::uint64_t call_seed) const
186 {
187 if (_cities.empty())
188 {
189 throw std::runtime_error(
190 "No city data loaded. Call load() first.");
191 }
192
193 effolkronium::random_local call_engine;
194 call_engine.seed(static_cast<std::mt19937::result_type>(
195 (call_seed ^ (call_seed >> 32U))));
196
197 auto idx = _weighted
198 ? _distribution(call_engine.engine())
199 : _uniform(call_engine.engine());
200 city result = _cities[idx];
201 result._seed = call_seed;
202 return result;
203 }
204
205 /// @brief Generate a random city filtered by country code.
206 /// @param country ISO-3166 two-letter country code (e.g., "BR", "US").
207 /// @return A city object from the specified country.
208 /// @throws std::runtime_error If no city data has been loaded.
209 /// @throws std::invalid_argument If no cities match the country code.
210 [[nodiscard]] city get_city(const std::string& country)
211 {
212 auto call_seed = static_cast<std::uint64_t>(_engine());
213 return get_city(country, call_seed);
214 }
215
216 /// @brief Generate a deterministic city filtered by country code.
217 /// @param country ISO-3166 two-letter country code.
218 /// @param call_seed Seed for reproducible results.
219 /// @return A city object from the specified country.
220 /// @throws std::runtime_error If no city data has been loaded.
221 /// @throws std::invalid_argument If no cities match the country code.
222 [[nodiscard]] city get_city(const std::string& country,
223 std::uint64_t call_seed) const
224 {
225 if (_cities.empty())
226 {
227 throw std::runtime_error(
228 "No city data loaded. Call load() first.");
229 }
230
231 auto it = _country_index.find(country);
232 if (it == _country_index.end())
233 {
234 throw std::invalid_argument(
235 "No cities found for country code: " + country);
236 }
237
238 effolkronium::random_local call_engine;
239 call_engine.seed(static_cast<std::mt19937::result_type>(
240 (call_seed ^ (call_seed >> 32U))));
241
242 const auto& idx = it->second;
243 auto selected = _weighted
244 ? idx.distribution(call_engine.engine())
245 : std::uniform_int_distribution<std::size_t>(
246 0, idx.city_indices.size() - 1)(
247 call_engine.engine());
248 city result = _cities[idx.city_indices[selected]];
249 result._seed = call_seed;
250 return result;
251 }
252
253 /// @name Seeding
254 /// @{
255
256 /// @brief Seed the internal random engine for deterministic sequences.
257 ///
258 /// Subsequent get_city() calls (without an explicit seed) draw
259 /// per-call seeds from this engine, producing a reproducible sequence.
260 ///
261 /// @param seed_value The seed value.
262 /// @return `*this` for chaining.
263 cg& seed(std::uint64_t seed_value)
264 {
265 _engine.seed(seed_value);
266 return *this;
267 }
268
269 /// @brief Reseed the engine with a non-deterministic source.
270 ///
271 /// Subsequent get_city() calls will produce non-reproducible results.
272 /// @return `*this` for chaining.
274 {
275 _engine.seed(std::random_device{}());
276 return *this;
277 }
278
279 /// @}
280
281 /// @brief Set whether generation is population-weighted or uniform.
282 ///
283 /// When `true` (default), larger cities are proportionally more likely
284 /// to be selected. When `false`, every city has an equal probability.
285 ///
286 /// @param enable `true` for weighted, `false` for uniform.
287 /// @return `*this` for chaining.
288 cg& weighted(bool enable)
289 {
290 _weighted = enable;
291 return *this;
292 }
293
294 /// @brief Query whether generation is population-weighted.
295 /// @return `true` if weighted (default), `false` if uniform.
296 [[nodiscard]] bool weighted() const
297 {
298 return _weighted;
299 }
300
301 /// @brief Load admin1 code-to-name mappings from a GeoNames
302 /// admin1CodesASCII.txt file.
303 ///
304 /// Each line has tab-separated fields:
305 /// CC.CODE \t Name \t ASCII_Name \t GeoNameID
306 ///
307 /// After loading, resolve_admin1() can translate raw admin1 codes
308 /// into human-readable names. Safe to call multiple times.
309 ///
310 /// @param path Path to admin1CodesASCII.txt.
311 void load_admin1(const std::filesystem::path& path)
312 {
313 if (!std::filesystem::is_regular_file(path))
314 return;
315
316 std::ifstream file{path};
317 if (!file.is_open())
318 return;
319
320 std::string line;
321 while (std::getline(file, line))
322 {
323 if (line.empty())
324 continue;
325 if (!line.empty() && line.back() == '\r')
326 line.pop_back();
327
328 // Split on first two tabs: key \t name \t ...
329 auto t1 = line.find('\t');
330 if (t1 == std::string::npos)
331 continue;
332 auto t2 = line.find('\t', t1 + 1);
333 if (t2 == std::string::npos)
334 t2 = line.size();
335
336 auto key = line.substr(0, t1); // "CC.CODE"
337 auto name = line.substr(t1 + 1, t2 - t1 - 1);
338 _admin1_names[key] = name;
339 }
340 }
341
342 /// @brief Resolve an admin1 code to a human-readable name.
343 ///
344 /// Looks up the combination of country_code and admin1_code
345 /// (e.g., "US" + "CA" → "California") in the loaded admin1 mapping.
346 /// Returns the raw code if no match is found.
347 ///
348 /// @param country_code ISO-3166 two-letter country code.
349 /// @param admin1_code GeoNames first-level administrative code.
350 /// @return The admin1 name, or the raw code if not mapped.
351 [[nodiscard]] std::string resolve_admin1(
352 const std::string& country_code,
353 const std::string& admin1_code) const
354 {
355 if (admin1_code.empty())
356 return {};
357
358 auto key = country_code + "." + admin1_code;
359 auto it = _admin1_names.find(key);
360 if (it != _admin1_names.end())
361 return it->second;
362 return admin1_code;
363 }
364
365 /// @brief Check whether admin1 name mappings have been loaded.
366 /// @return `true` if at least one mapping is available.
367 [[nodiscard]] bool has_admin1() const
368 {
369 return !_admin1_names.empty();
370 }
371
372 /// @brief Check whether any city data has been loaded.
373 /// @return `true` if at least one city is available.
374 [[nodiscard]] bool has_data() const
375 {
376 return !_cities.empty();
377 }
378
379 /// @brief Return the number of loaded cities.
380 /// @return City count.
381 [[nodiscard]] std::size_t city_count() const
382 {
383 return _cities.size();
384 }
385
386 /// @brief Load city data from a TSV file.
387 ///
388 /// Expects a tab-delimited file with a header row and 16 columns
389 /// matching the GeoNames schema (as produced by
390 /// scripts/prepare_geonames.py). Safe to call multiple times to add
391 /// from different files.
392 ///
393 /// @param tsv_path Path to the TSV file.
394 void load(const std::filesystem::path& tsv_path)
395 {
396 if (!std::filesystem::exists(tsv_path) ||
397 !std::filesystem::is_regular_file(tsv_path))
398 {
399 return;
400 }
401
402 std::ifstream file{tsv_path};
403 if (!file.is_open())
404 {
405 return;
406 }
407
408 std::string line;
409
410 // Skip header row.
411 if (!std::getline(file, line))
412 {
413 return;
414 }
415
416 while (std::getline(file, line))
417 {
418 if (line.empty())
419 {
420 continue;
421 }
422
423 // Strip trailing carriage return.
424 if (!line.empty() && line.back() == '\r')
425 {
426 line.pop_back();
427 }
428
429 auto c = parse_line(line);
430 if (c.geonameid != 0)
431 {
432 _cities.push_back(std::move(c));
433 }
434 }
435
436 rebuild_indices();
437 }
438
439 /// @brief Load a specific dataset tier from a base resources directory.
440 ///
441 /// Probes common base paths ("resources", "../resources",
442 /// "city-generator/resources") and loads from the `lite/` or `full/`
443 /// subfolder according to @p tier.
444 ///
445 /// @param tier The dataset size to load (dataset::lite or dataset::full).
446 /// @return `true` if a matching directory was found and loaded.
447 bool load(dataset tier)
448 {
449 static constexpr std::array probe_paths = {
450 "resources", "../resources", "city-generator/resources"};
451
452 const char* subfolder =
453 (tier == dataset::full) ? "full" : "lite";
454
455 auto found = std::ranges::find_if(probe_paths, [&](const char* base) {
456 const auto tsv =
457 std::filesystem::path{base} / subfolder / "cities.tsv";
458 return std::filesystem::is_regular_file(tsv);
459 });
460 if (found != probe_paths.end())
461 {
462 load(std::filesystem::path{*found} / subfolder / "cities.tsv");
463 return true;
464 }
465 return false;
466 }
467
468 private:
469 // All loaded cities.
470 std::vector<city> _cities;
471
472 // Admin1 code-to-name mapping ("CC.CODE" → name).
473 std::unordered_map<std::string, std::string> _admin1_names;
474
475 // Population-weighted distribution over _cities indices.
476 mutable std::discrete_distribution<std::size_t> _distribution;
477
478 // Uniform distribution over _cities indices.
479 mutable std::uniform_int_distribution<std::size_t> _uniform;
480
481 // Whether to use population-weighted (true) or uniform (false) selection.
482 bool _weighted{true};
483
484 // Pre-built per-country index for O(1) country-filtered lookups.
485 struct country_entry
486 {
487 std::vector<std::size_t> city_indices;
488 mutable std::discrete_distribution<std::size_t> distribution;
489 };
490
491 std::unordered_map<std::string, country_entry> _country_index;
492
493 // Per-instance random engine for seed drawing.
494 std::mt19937_64 _engine{std::random_device{}()};
495
496 // Tag type for the auto-probing singleton constructor.
497 struct auto_probe_tag {};
498
499 // Singleton constructor: auto-probes common resource locations.
500 explicit cg(auto_probe_tag /*tag*/)
501 {
502 static constexpr std::array probe_paths = {
503 "resources", "../resources", "city-generator/resources"};
504
505 auto found = std::ranges::find_if(probe_paths, [](const char* p) {
506 return std::filesystem::exists(p) &&
507 std::filesystem::is_directory(p);
508 });
509 if (found != probe_paths.end())
510 {
511 const std::filesystem::path base{*found};
512 auto lite_tsv = base / "lite" / "cities.tsv";
513 auto full_tsv = base / "full" / "cities.tsv";
514 if (std::filesystem::is_regular_file(lite_tsv))
515 {
516 load(lite_tsv);
517 }
518 else if (std::filesystem::is_regular_file(full_tsv))
519 {
520 load(full_tsv);
521 }
522 }
523 }
524
525 // Rebuild the global distribution and country index.
526 void rebuild_indices()
527 {
528 // Global distribution.
529 std::vector<double> weights;
530 weights.reserve(_cities.size());
531
532 std::ranges::transform(_cities, std::back_inserter(weights),
533 [](const city& c) {
534 return static_cast<double>(
535 std::max<std::uint64_t>(c.population, 1));
536 });
537
538 _distribution = std::discrete_distribution<std::size_t>(
539 weights.begin(), weights.end());
540
541 // Uniform distribution.
542 if (!_cities.empty())
543 {
544 _uniform = std::uniform_int_distribution<std::size_t>(
545 0, _cities.size() - 1);
546 }
547
548 // Country index.
549 _country_index.clear();
550
551 for (auto&& [i, c] : _cities | std::views::enumerate)
552 {
553 auto& entry = _country_index[c.country_code];
554 entry.city_indices.push_back(static_cast<std::size_t>(i));
555 }
556
557 // Build per-country distributions.
558 for (auto& [code, entry] : _country_index)
559 {
560 std::vector<double> cw;
561 cw.reserve(entry.city_indices.size());
562
563 for (auto idx : entry.city_indices)
564 {
565 cw.push_back(static_cast<double>(
566 std::max<std::uint64_t>(_cities[idx].population, 1)));
567 }
568
569 entry.distribution = std::discrete_distribution<std::size_t>(
570 cw.begin(), cw.end());
571 }
572 }
573
574 // Parse a single tab-delimited line into a city object.
575 // Expected 16 fields: geonameid, name, asciiname, latitude, longitude,
576 // feature_code, country_code, cc2, admin1..4, population, elevation,
577 // dem, timezone.
578 static city parse_line(const std::string& line)
579 {
580 city c;
581
582 // Split by tabs using C++23 views::split.
583 std::vector<std::string> fields;
584 fields.reserve(16);
585
586 for (auto part : line | std::views::split('\t'))
587 {
588 fields.emplace_back(std::ranges::begin(part),
589 std::ranges::end(part));
590 }
591
592 // Expected field count.
593 static constexpr std::size_t expected_fields{16};
594
595 if (fields.size() < expected_fields)
596 {
597 return c; // geonameid stays 0, skipped by caller.
598 }
599
600 try
601 {
602 c.geonameid = static_cast<std::uint32_t>(std::stoul(fields[0]));
603 c.name = fields[1];
604 c.asciiname = fields[2];
605 c.latitude = std::stod(fields[3]);
606 c.longitude = std::stod(fields[4]);
607 c.feature_code = fields[5];
608 c.country_code = fields[6];
609 c.cc2 = fields[7];
610 c.admin1_code = fields[8];
611 c.admin2_code = fields[9];
612 c.admin3_code = fields[10];
613 c.admin4_code = fields[11];
614 c.population =
615 fields[12].empty()
616 ? 0
617 : static_cast<std::uint64_t>(std::stoull(fields[12]));
618 c.elevation =
619 fields[13].empty()
620 ? static_cast<std::int16_t>(-9999)
621 : static_cast<std::int16_t>(std::stoi(fields[13]));
622 c.dem = fields[14].empty()
623 ? static_cast<std::int16_t>(0)
624 : static_cast<std::int16_t>(std::stoi(fields[14]));
625 c.timezone = fields[15];
626 }
627 catch (...)
628 {
629 c.geonameid = 0; // Mark as invalid.
630 }
631
632 return c;
633 }
634};
635
636} // namespace dasmig
637
638#endif // DASMIG_CITYGEN_HPP
@ full
~200k cities (population >= 500).
@ lite
~25k major cities (population >= 15,000).
City generator that produces random cities from GeoNames data.
Definition citygen.hpp:140
~cg()=default
Default destructor.
void load_admin1(const std::filesystem::path &path)
Load admin1 code-to-name mappings from a GeoNames admin1CodesASCII.txt file.
Definition citygen.hpp:311
city get_city()
Generate a random city.
Definition citygen.hpp:171
bool load(dataset tier)
Load a specific dataset tier from a base resources directory.
Definition citygen.hpp:447
cg()=default
Default constructor — creates an empty generator with no data.
cg & seed(std::uint64_t seed_value)
Seed the internal random engine for deterministic sequences.
Definition citygen.hpp:263
std::size_t city_count() const
Return the number of loaded cities.
Definition citygen.hpp:381
cg & weighted(bool enable)
Set whether generation is population-weighted or uniform.
Definition citygen.hpp:288
city get_city(std::uint64_t call_seed) const
Generate a deterministic city using a specific seed.
Definition citygen.hpp:185
void load(const std::filesystem::path &tsv_path)
Load city data from a TSV file.
Definition citygen.hpp:394
bool weighted() const
Query whether generation is population-weighted.
Definition citygen.hpp:296
city get_city(const std::string &country, std::uint64_t call_seed) const
Generate a deterministic city filtered by country code.
Definition citygen.hpp:222
city get_city(const std::string &country)
Generate a random city filtered by country code.
Definition citygen.hpp:210
cg(cg &&)=default
Move constructor.
std::string resolve_admin1(const std::string &country_code, const std::string &admin1_code) const
Resolve an admin1 code to a human-readable name.
Definition citygen.hpp:351
cg & operator=(cg &&)=default
Move assignment.
cg & operator=(const cg &)=delete
Not copyable.
static cg & instance()
Access the global singleton instance.
Definition citygen.hpp:159
bool has_admin1() const
Check whether admin1 name mappings have been loaded.
Definition citygen.hpp:367
cg & unseed()
Reseed the engine with a non-deterministic source.
Definition citygen.hpp:273
cg(const cg &)=delete
Not copyable.
bool has_data() const
Check whether any city data has been loaded.
Definition citygen.hpp:374
Return type for city generation, holding all GeoNames fields.
Definition citygen.hpp:43
double latitude
WGS84 latitude in decimal degrees.
Definition citygen.hpp:55
std::uint32_t geonameid
GeoNames primary key.
Definition citygen.hpp:46
std::string admin3_code
Third-level administrative division code (township/commune).
Definition citygen.hpp:76
std::string admin4_code
Fourth-level administrative division code (sub-district).
Definition citygen.hpp:79
std::string asciiname
Plain ASCII transliteration of the name.
Definition citygen.hpp:52
double longitude
WGS84 longitude in decimal degrees.
Definition citygen.hpp:58
std::string country_code
ISO-3166 two-letter country code.
Definition citygen.hpp:64
std::string timezone
IANA timezone identifier (e.g., "Europe/London").
Definition citygen.hpp:91
std::string feature_code
GeoNames feature code (PPL, PPLA, PPLC, etc.).
Definition citygen.hpp:61
std::string name
UTF-8 city name.
Definition citygen.hpp:49
std::string admin2_code
Second-level administrative division code (county/district).
Definition citygen.hpp:73
std::uint64_t population
City population.
Definition citygen.hpp:82
std::string admin1_code
First-level administrative division code (state/province).
Definition citygen.hpp:70
std::uint64_t seed() const
Retrieve the random seed used to generate this city.
Definition citygen.hpp:96
std::int16_t dem
Digital elevation model value (more reliable than elevation).
Definition citygen.hpp:88
std::int16_t elevation
Elevation in meters (-9999 if unknown).
Definition citygen.hpp:85
friend std::ostream & operator<<(std::ostream &os, const city &c)
Stream the city name to an output stream.
Definition citygen.hpp:112
std::string cc2
Alternate country codes (comma-separated).
Definition citygen.hpp:67