City Generator 1.0.1
Procedural city generation for C++23
Loading...
Searching...
No Matches
citygen.hpp
Go to the documentation of this file.
1#ifndef DASMIG_CITYGEN_HPP
2#define DASMIG_CITYGEN_HPP
3
4#include "random.hpp"
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <cstdint>
9#include <filesystem>
10#include <fstream>
11#include <ostream>
12#include <random>
13#include <ranges>
14#include <stdexcept>
15#include <string>
16#include <unordered_map>
17#include <utility>
18#include <vector>
19
20/// @file citygen.hpp
21/// @brief City generator library — procedural city generation for C++23.
22/// @author Diego Dasso Migotto (diegomigotto at hotmail dot com)
23/// @see See doc/usage.md for the narrative tutorial.
24
25namespace dasmig
26{
27
28/// @brief Dataset size tier for resource loading.
29#ifndef DASMIG_DATASET_DEFINED
30#define DASMIG_DATASET_DEFINED
enum class dataset : std::uint8_t
{
    /// ~25k major cities (population >= 15,000).
    lite = 0,

    /// ~200k cities (population >= 500).
    full = 1
};
36#endif
37
/// @brief Value type returned by the generator; mirrors one GeoNames record.
///
/// A city converts implicitly to std::string and can be written to an
/// output stream with operator<<; both yield the UTF-8 city name.
class city
{
  public:
    std::uint32_t geonameid = 0; ///< GeoNames primary key.

    std::string name;      ///< UTF-8 city name.
    std::string asciiname; ///< Plain ASCII transliteration of the name.

    double latitude = 0.0;  ///< WGS84 latitude in decimal degrees.
    double longitude = 0.0; ///< WGS84 longitude in decimal degrees.

    std::string feature_code; ///< GeoNames feature code (PPL, PPLA, PPLC, etc.).
    std::string country_code; ///< ISO-3166 two-letter country code.
    std::string cc2;          ///< Alternate country codes (comma-separated).

    std::string admin1_code; ///< First-level administrative division code (state/province).
    std::string admin2_code; ///< Second-level administrative division code (county/district).
    std::string admin3_code; ///< Third-level administrative division code (township/commune).
    std::string admin4_code; ///< Fourth-level administrative division code (sub-district).

    std::uint64_t population = 0; ///< City population.

    std::int16_t elevation = -9999; ///< Elevation in meters (-9999 if unknown).
    std::int16_t dem = 0; ///< Digital elevation model value (more reliable than elevation).

    std::string timezone; ///< IANA timezone identifier (e.g., "Europe/London").

    /// @brief Seed that was used to pick this city.
    /// @return The per-call seed, suitable for replaying the same pick.
    /// @see cg::get_city(std::uint64_t)
    [[nodiscard]] std::uint64_t seed() const { return _seed; }

    /// @brief Implicit conversion to std::string.
    /// @return A copy of the UTF-8 city name.
    operator std::string() const // NOLINT(hicpp-explicit-conversions)
    {
        return name;
    }

    /// @brief Write the city name to an output stream.
    /// @param os Destination stream.
    /// @param c City whose name is written.
    /// @return @p os, to allow chaining.
    friend std::ostream& operator<<(std::ostream& os, const city& c)
    {
        return os << c.name;
    }

  private:
    std::uint64_t _seed = 0; ///< Seed recorded by cg when the city is produced.

    friend class cg; ///< cg writes _seed directly.
};
123
124/// @brief City generator that produces random cities from GeoNames data.
125///
126/// Generates cities using population-weighted random selection from
127/// GeoNames data. Larger cities are proportionally more likely to be
128/// selected, mirroring real-world population distributions.
129///
130/// Can be used as a singleton via instance() or constructed independently.
131/// Independent instances own their own data and random engine,
132/// making them safe for concurrent use without shared state.
133///
134/// @par Thread safety
135/// Each instance is independent. Concurrent calls to get_city() on
136/// the **same** instance require external synchronization. load() mutates
137/// internal state and must not be called concurrently with get_city()
138/// on the same instance.
139class cg
140{
141 public:
142 /// @brief Default constructor — creates an empty generator with no data.
143 ///
144 /// Call load() to populate city data before generating.
145 cg() = default;
146
147 cg(const cg&) = delete; ///< Not copyable.
148 cg& operator=(const cg&) = delete; ///< Not copyable.
149 cg(cg&&) = default; ///< Move constructor.
150 cg& operator=(cg&&) = default; ///< Move assignment.
151 ~cg() = default; ///< Default destructor.
152
153 /// @brief Access the global singleton instance.
154 ///
155 /// The singleton auto-probes common resource paths on first access.
156 /// For independent generators (e.g., inside an entity-generator
157 /// component), prefer constructing a separate cg instance.
158 /// @return Reference to the global cg instance.
159 static cg& instance()
160 {
161 static cg inst{auto_probe_tag{}};
162 return inst;
163 }
164
165 /// @brief Generate a random city.
166 ///
167 /// By default, selection is population-weighted. Call weighted(false)
168 /// to switch to uniform random selection.
169 /// @return A city object with all GeoNames fields.
170 /// @throws std::runtime_error If no city data has been loaded.
171 [[nodiscard]] city get_city()
172 {
173 auto call_seed = static_cast<std::uint64_t>(_engine());
174 return get_city(call_seed);
175 }
176
177 /// @brief Generate a deterministic city using a specific seed.
178 ///
179 /// Given the same loaded data and seed, this method always produces the
180 /// same city. Retrieve the seed from a previous city via city::seed().
181 ///
182 /// @param call_seed Seed for reproducible results.
183 /// @return A city object with all GeoNames fields.
184 /// @throws std::runtime_error If no city data has been loaded.
185 [[nodiscard]] city get_city(std::uint64_t call_seed) const
186 {
187 if (_cities.empty())
188 {
189 throw std::runtime_error(
190 "No city data loaded. Call load() first.");
191 }
192
193 effolkronium::random_local call_engine;
194 call_engine.seed(static_cast<std::mt19937::result_type>(
195 (call_seed ^ (call_seed >> 32U))));
196
197 auto idx = _weighted
198 ? _distribution(call_engine.engine())
199 : _uniform(call_engine.engine());
200 city result = _cities[idx];
201 result._seed = call_seed;
202 return result;
203 }
204
205 /// @brief Generate a random city filtered by country code.
206 /// @param country ISO-3166 two-letter country code (e.g., "BR", "US").
207 /// @return A city object from the specified country.
208 /// @throws std::runtime_error If no city data has been loaded.
209 /// @throws std::invalid_argument If no cities match the country code.
210 [[nodiscard]] city get_city(const std::string& country)
211 {
212 auto call_seed = static_cast<std::uint64_t>(_engine());
213 return get_city(country, call_seed);
214 }
215
216 /// @brief Generate a deterministic city filtered by country code.
217 /// @param country ISO-3166 two-letter country code.
218 /// @param call_seed Seed for reproducible results.
219 /// @return A city object from the specified country.
220 /// @throws std::runtime_error If no city data has been loaded.
221 /// @throws std::invalid_argument If no cities match the country code.
222 [[nodiscard]] city get_city(const std::string& country,
223 std::uint64_t call_seed) const
224 {
225 if (_cities.empty())
226 {
227 throw std::runtime_error(
228 "No city data loaded. Call load() first.");
229 }
230
231 auto it = _country_index.find(country);
232 if (it == _country_index.end())
233 {
234 throw std::invalid_argument(
235 "No cities found for country code: " + country);
236 }
237
238 effolkronium::random_local call_engine;
239 call_engine.seed(static_cast<std::mt19937::result_type>(
240 (call_seed ^ (call_seed >> 32U))));
241
242 const auto& idx = it->second;
243 auto selected = _weighted
244 ? idx.distribution(call_engine.engine())
245 : std::uniform_int_distribution<std::size_t>(
246 0, idx.city_indices.size() - 1)(
247 call_engine.engine());
248 city result = _cities[idx.city_indices[selected]];
249 result._seed = call_seed;
250 return result;
251 }
252
253 /// @name Seeding
254 /// @{
255
256 /// @brief Seed the internal random engine for deterministic sequences.
257 ///
258 /// Subsequent get_city() calls (without an explicit seed) draw
259 /// per-call seeds from this engine, producing a reproducible sequence.
260 ///
261 /// @param seed_value The seed value.
262 /// @return `*this` for chaining.
263 cg& seed(std::uint64_t seed_value)
264 {
265 _engine.seed(seed_value);
266 return *this;
267 }
268
269 /// @brief Reseed the engine with a non-deterministic source.
270 ///
271 /// Subsequent get_city() calls will produce non-reproducible results.
272 /// @return `*this` for chaining.
274 {
275 _engine.seed(std::random_device{}());
276 return *this;
277 }
278
279 /// @}
280
281 /// @brief Set whether generation is population-weighted or uniform.
282 ///
283 /// When `true` (default), larger cities are proportionally more likely
284 /// to be selected. When `false`, every city has an equal probability.
285 ///
286 /// @param enable `true` for weighted, `false` for uniform.
287 /// @return `*this` for chaining.
288 cg& weighted(bool enable)
289 {
290 _weighted = enable;
291 return *this;
292 }
293
294 /// @brief Query whether generation is population-weighted.
295 /// @return `true` if weighted (default), `false` if uniform.
296 [[nodiscard]] bool weighted() const
297 {
298 return _weighted;
299 }
300
301 /// @brief Check whether any city data has been loaded.
302 /// @return `true` if at least one city is available.
303 [[nodiscard]] bool has_data() const
304 {
305 return !_cities.empty();
306 }
307
308 /// @brief Return the number of loaded cities.
309 /// @return City count.
310 [[nodiscard]] std::size_t city_count() const
311 {
312 return _cities.size();
313 }
314
315 /// @brief Load city data from a TSV file.
316 ///
317 /// Expects a tab-delimited file with a header row and 16 columns
318 /// matching the GeoNames schema (as produced by
319 /// scripts/prepare_geonames.py). Safe to call multiple times to add
320 /// from different files.
321 ///
322 /// @param tsv_path Path to the TSV file.
323 void load(const std::filesystem::path& tsv_path)
324 {
325 if (!std::filesystem::exists(tsv_path) ||
326 !std::filesystem::is_regular_file(tsv_path))
327 {
328 return;
329 }
330
331 std::ifstream file{tsv_path};
332 if (!file.is_open())
333 {
334 return;
335 }
336
337 std::string line;
338
339 // Skip header row.
340 if (!std::getline(file, line))
341 {
342 return;
343 }
344
345 while (std::getline(file, line))
346 {
347 if (line.empty())
348 {
349 continue;
350 }
351
352 // Strip trailing carriage return.
353 if (!line.empty() && line.back() == '\r')
354 {
355 line.pop_back();
356 }
357
358 auto c = parse_line(line);
359 if (c.geonameid != 0)
360 {
361 _cities.push_back(std::move(c));
362 }
363 }
364
365 rebuild_indices();
366 }
367
368 /// @brief Load a specific dataset tier from a base resources directory.
369 ///
370 /// Probes common base paths ("resources", "../resources",
371 /// "city-generator/resources") and loads from the `lite/` or `full/`
372 /// subfolder according to @p tier.
373 ///
374 /// @param tier The dataset size to load (dataset::lite or dataset::full).
375 /// @return `true` if a matching directory was found and loaded.
376 bool load(dataset tier)
377 {
378 static constexpr std::array probe_paths = {
379 "resources", "../resources", "city-generator/resources"};
380
381 const char* subfolder =
382 (tier == dataset::full) ? "full" : "lite";
383
384 auto found = std::ranges::find_if(probe_paths, [&](const char* base) {
385 const auto tsv =
386 std::filesystem::path{base} / subfolder / "cities.tsv";
387 return std::filesystem::is_regular_file(tsv);
388 });
389 if (found != probe_paths.end())
390 {
391 load(std::filesystem::path{*found} / subfolder / "cities.tsv");
392 return true;
393 }
394 return false;
395 }
396
397 private:
398 // All loaded cities.
399 std::vector<city> _cities;
400
401 // Population-weighted distribution over _cities indices.
402 mutable std::discrete_distribution<std::size_t> _distribution;
403
404 // Uniform distribution over _cities indices.
405 mutable std::uniform_int_distribution<std::size_t> _uniform;
406
407 // Whether to use population-weighted (true) or uniform (false) selection.
408 bool _weighted{true};
409
410 // Pre-built per-country index for O(1) country-filtered lookups.
411 struct country_entry
412 {
413 std::vector<std::size_t> city_indices;
414 mutable std::discrete_distribution<std::size_t> distribution;
415 };
416
417 std::unordered_map<std::string, country_entry> _country_index;
418
419 // Per-instance random engine for seed drawing.
420 std::mt19937_64 _engine{std::random_device{}()};
421
422 // Tag type for the auto-probing singleton constructor.
423 struct auto_probe_tag {};
424
425 // Singleton constructor: auto-probes common resource locations.
426 explicit cg(auto_probe_tag /*tag*/)
427 {
428 static constexpr std::array probe_paths = {
429 "resources", "../resources", "city-generator/resources"};
430
431 auto found = std::ranges::find_if(probe_paths, [](const char* p) {
432 return std::filesystem::exists(p) &&
433 std::filesystem::is_directory(p);
434 });
435 if (found != probe_paths.end())
436 {
437 const std::filesystem::path base{*found};
438 auto lite_tsv = base / "lite" / "cities.tsv";
439 auto full_tsv = base / "full" / "cities.tsv";
440 if (std::filesystem::is_regular_file(lite_tsv))
441 {
442 load(lite_tsv);
443 }
444 else if (std::filesystem::is_regular_file(full_tsv))
445 {
446 load(full_tsv);
447 }
448 }
449 }
450
451 // Rebuild the global distribution and country index.
452 void rebuild_indices()
453 {
454 // Global distribution.
455 std::vector<double> weights;
456 weights.reserve(_cities.size());
457
458 std::ranges::transform(_cities, std::back_inserter(weights),
459 [](const city& c) {
460 return static_cast<double>(
461 std::max<std::uint64_t>(c.population, 1));
462 });
463
464 _distribution = std::discrete_distribution<std::size_t>(
465 weights.begin(), weights.end());
466
467 // Uniform distribution.
468 if (!_cities.empty())
469 {
470 _uniform = std::uniform_int_distribution<std::size_t>(
471 0, _cities.size() - 1);
472 }
473
474 // Country index.
475 _country_index.clear();
476
477 for (auto&& [i, c] : _cities | std::views::enumerate)
478 {
479 auto& entry = _country_index[c.country_code];
480 entry.city_indices.push_back(static_cast<std::size_t>(i));
481 }
482
483 // Build per-country distributions.
484 for (auto& [code, entry] : _country_index)
485 {
486 std::vector<double> cw;
487 cw.reserve(entry.city_indices.size());
488
489 for (auto idx : entry.city_indices)
490 {
491 cw.push_back(static_cast<double>(
492 std::max<std::uint64_t>(_cities[idx].population, 1)));
493 }
494
495 entry.distribution = std::discrete_distribution<std::size_t>(
496 cw.begin(), cw.end());
497 }
498 }
499
500 // Parse a single tab-delimited line into a city object.
501 // Expected 16 fields: geonameid, name, asciiname, latitude, longitude,
502 // feature_code, country_code, cc2, admin1..4, population, elevation,
503 // dem, timezone.
504 static city parse_line(const std::string& line)
505 {
506 city c;
507
508 // Split by tabs using C++23 views::split.
509 std::vector<std::string> fields;
510 fields.reserve(16);
511
512 for (auto part : line | std::views::split('\t'))
513 {
514 fields.emplace_back(std::ranges::begin(part),
515 std::ranges::end(part));
516 }
517
518 // Expected field count.
519 static constexpr std::size_t expected_fields{16};
520
521 if (fields.size() < expected_fields)
522 {
523 return c; // geonameid stays 0, skipped by caller.
524 }
525
526 try
527 {
528 c.geonameid = static_cast<std::uint32_t>(std::stoul(fields[0]));
529 c.name = fields[1];
530 c.asciiname = fields[2];
531 c.latitude = std::stod(fields[3]);
532 c.longitude = std::stod(fields[4]);
533 c.feature_code = fields[5];
534 c.country_code = fields[6];
535 c.cc2 = fields[7];
536 c.admin1_code = fields[8];
537 c.admin2_code = fields[9];
538 c.admin3_code = fields[10];
539 c.admin4_code = fields[11];
540 c.population =
541 fields[12].empty()
542 ? 0
543 : static_cast<std::uint64_t>(std::stoull(fields[12]));
544 c.elevation =
545 fields[13].empty()
546 ? static_cast<std::int16_t>(-9999)
547 : static_cast<std::int16_t>(std::stoi(fields[13]));
548 c.dem = fields[14].empty()
549 ? static_cast<std::int16_t>(0)
550 : static_cast<std::int16_t>(std::stoi(fields[14]));
551 c.timezone = fields[15];
552 }
553 catch (...)
554 {
555 c.geonameid = 0; // Mark as invalid.
556 }
557
558 return c;
559 }
560};
561
562} // namespace dasmig
563
564#endif // DASMIG_CITYGEN_HPP
@ full
~200k cities (population >= 500).
@ lite
~25k major cities (population >= 15,000).
City generator that produces random cities from GeoNames data.
Definition citygen.hpp:140
~cg()=default
Default destructor.
city get_city()
Generate a random city.
Definition citygen.hpp:171
bool load(dataset tier)
Load a specific dataset tier from a base resources directory.
Definition citygen.hpp:376
cg()=default
Default constructor — creates an empty generator with no data.
cg & seed(std::uint64_t seed_value)
Seed the internal random engine for deterministic sequences.
Definition citygen.hpp:263
std::size_t city_count() const
Return the number of loaded cities.
Definition citygen.hpp:310
cg & weighted(bool enable)
Set whether generation is population-weighted or uniform.
Definition citygen.hpp:288
city get_city(std::uint64_t call_seed) const
Generate a deterministic city using a specific seed.
Definition citygen.hpp:185
void load(const std::filesystem::path &tsv_path)
Load city data from a TSV file.
Definition citygen.hpp:323
bool weighted() const
Query whether generation is population-weighted.
Definition citygen.hpp:296
city get_city(const std::string &country, std::uint64_t call_seed) const
Generate a deterministic city filtered by country code.
Definition citygen.hpp:222
city get_city(const std::string &country)
Generate a random city filtered by country code.
Definition citygen.hpp:210
cg(cg &&)=default
Move constructor.
cg & operator=(cg &&)=default
Move assignment.
cg & operator=(const cg &)=delete
Not copyable.
static cg & instance()
Access the global singleton instance.
Definition citygen.hpp:159
cg & unseed()
Reseed the engine with a non-deterministic source.
Definition citygen.hpp:273
cg(const cg &)=delete
Not copyable.
bool has_data() const
Check whether any city data has been loaded.
Definition citygen.hpp:303
Return type for city generation, holding all GeoNames fields.
Definition citygen.hpp:43
double latitude
WGS84 latitude in decimal degrees.
Definition citygen.hpp:55
std::uint32_t geonameid
GeoNames primary key.
Definition citygen.hpp:46
std::string admin3_code
Third-level administrative division code (township/commune).
Definition citygen.hpp:76
std::string admin4_code
Fourth-level administrative division code (sub-district).
Definition citygen.hpp:79
std::string asciiname
Plain ASCII transliteration of the name.
Definition citygen.hpp:52
double longitude
WGS84 longitude in decimal degrees.
Definition citygen.hpp:58
std::string country_code
ISO-3166 two-letter country code.
Definition citygen.hpp:64
std::string timezone
IANA timezone identifier (e.g., "Europe/London").
Definition citygen.hpp:91
std::string feature_code
GeoNames feature code (PPL, PPLA, PPLC, etc.).
Definition citygen.hpp:61
std::string name
UTF-8 city name.
Definition citygen.hpp:49
std::string admin2_code
Second-level administrative division code (county/district).
Definition citygen.hpp:73
std::uint64_t population
City population.
Definition citygen.hpp:82
std::string admin1_code
First-level administrative division code (state/province).
Definition citygen.hpp:70
std::uint64_t seed() const
Retrieve the random seed used to generate this city.
Definition citygen.hpp:96
std::int16_t dem
Digital elevation model value (more reliable than elevation).
Definition citygen.hpp:88
std::int16_t elevation
Elevation in meters (-9999 if unknown).
Definition citygen.hpp:85
friend std::ostream & operator<<(std::ostream &os, const city &c)
Stream the city name to an output stream.
Definition citygen.hpp:112
std::string cc2
Alternate country codes (comma-separated).
Definition citygen.hpp:67