Name Generator 2.0.1
Culture-aware name generation for C++23
Loading...
Searching...
No Matches
namegen.hpp
Go to the documentation of this file.
1#ifndef DASMIG_NAMEGEN_HPP
2#define DASMIG_NAMEGEN_HPP
3
4#include "random.hpp"
5#include <algorithm>
6#include <array>
7#include <cstddef>
8#include <cstdint>
9#include <filesystem>
10#include <fstream>
11#include <functional>
12#include <iterator>
13#include <map>
14#include <ostream>
15#include <random>
16#include <stdexcept>
17#include <string>
18#include <string_view>
19#include <utility>
20#include <vector>
21
22/// @file namegen.hpp
23/// @brief Name generator library — culture-aware name generation for C++23.
24/// @author Diego Dasso Migotto (diegomigotto at hotmail dot com)
25/// @see See doc/usage.md for the narrative tutorial.
26
27struct ng_test_access;
28
29namespace dasmig
30{
31
/// @brief Culture (a country or a broader group) that names are drawn from.
///
/// Enumerators are kept in alphabetical order and numbered from zero, with
/// `any` last. The numeric values are used as indices into the label table
/// in ng::culture_label(), so the order here must not change independently.
enum class culture : std::uint8_t
{
    // a
    afghan, albanian, algerian, american, angolan, argentinian, austrian,
    azerbaijani,
    // b
    bahraini, bangladeshi, belgian, bolivian, botswanan, brazilian, british,
    bruneian, bulgarian, burkinabe, burundian,
    // c
    cambodian, cameroonian, canadian, chilean, chinese, colombian, costarican,
    croatian, cypriot, czech,
    // d
    danish, djiboutian, dutch,
    // e
    ecuadorian, egyptian, emirati, estonian, ethiopian,
    // f
    fijian, filipino, finnish, french,
    // g
    georgian, german, ghanaian, greek, guatemalan,
    // h
    haitian, honduran, hongkonger, hungarian,
    // i
    icelandic, indian, indonesian, iranian, iraqi, irish, israeli, italian,
    // j
    jamaican, japanese, jordanian,
    // k
    kazakh, korean, kuwaiti,
    // l
    lebanese, libyan, lithuanian, luxembourgish,
    // m
    macanese, malaysian, maldivian, maltese, mauritian, mexican, moldovan,
    moroccan,
    // n
    namibian, nigerian, norwegian,
    // o
    omani,
    // p
    palestinian, panamanian, peruvian, polish, portuguese, puertorican,
    // q
    qatari,
    // r
    russian,
    // s
    salvadoran, saudi, serbian, singaporean, slovenian, southafrican, spanish,
    sudanese, swedish, swiss, syrian,
    // t
    taiwanese, tunisian, turkish, turkmen,
    // u
    uruguayan,
    // y
    yemeni,
    // Wildcard: resolved to a random loaded culture at generation time.
    any
};
142
/// @brief Size tier of the bundled name data, used to choose which resource
/// subfolder to load.
///
/// Wrapped in its own guard macro so that a definition made elsewhere under
/// the same macro is not repeated here.
#ifndef DASMIG_DATASET_DEFINED
#define DASMIG_DATASET_DEFINED
enum class dataset : std::uint8_t
{
    /// Top-500 names per category (~2 MB).
    lite,
    /// Complete dataset (~39 MB).
    full
};
#endif
152
/// @brief Gender selector for forenames.
///
/// `any` is a wildcard that the generator resolves to `m` or `f` at random.
enum class gender : std::uint8_t
{
    m,  ///< Male.
    f,  ///< Female.
    any ///< No preference.
};
160
161/// @brief Return type for name generation, holding both individual parts and
162/// the full composed string.
163///
164/// Supports implicit conversion to std::wstring, streaming via operator<<,
165/// and chained appending of names and surnames.
166class name
167{
168 public:
169 /// @brief Retrieve the random seed used to generate this name.
170 /// @return The per-call seed for replay.
171 /// @see ng::get_name(gender, culture, std::uint64_t)
172 [[nodiscard]] std::uint64_t seed() const
173 {
174 return _seed;
175 }
176
177 /// @brief Return the individual parts (names/surnames) as a vector.
178 /// @return Vector of name parts.
179 [[nodiscard]] const std::vector<std::wstring>& parts() const
180 {
181 return _parts;
182 }
183
184 /// @brief Append a forename to this name, preserving gender and culture.
185 /// @return `*this` for chaining.
186 name& append_name();
187
188 /// @brief Append a forename of a specific culture.
189 /// @param c Culture for the appended name.
190 /// @return `*this` for chaining.
192
193 /// @brief Append a surname to this name, preserving culture.
194 /// @return `*this` for chaining.
196
197 /// @brief Append a surname of a specific culture.
198 /// @param c Culture for the appended surname.
199 /// @return `*this` for chaining.
201
202 /// @brief Implicit conversion to std::wstring.
203 /// @return The full composed name string.
204 operator std::wstring() const // NOLINT(hicpp-explicit-conversions)
205 {
206 return _full_string;
207 }
208
209 /// @brief Implicit conversion to a vector of name parts.
210 operator std::vector<std::wstring>() const // NOLINT(hicpp-explicit-conversions)
211 {
212 return _parts;
213 }
214
215 /// @brief Stream the name to a wide output stream.
216 /// @param wos Output stream.
217 /// @param n Name to stream.
218 /// @return Reference to the output stream.
219 friend std::wostream& operator<<(std::wostream& wos, const name& n)
220 {
221 wos << n._full_string;
222 return wos;
223 }
224
225 private:
226 // Private constructor — names are created only by ng.
227 name(std::wstring name_str, gender g, culture c, class ng* owner,
228 std::uint64_t seed = 0)
229 : _full_string(std::move(name_str)), _gender(g), _culture(c),
230 _owner(owner), _seed(seed)
231 {
232 _parts.push_back(_full_string);
233 }
234
235 std::wstring _full_string; ///< Full composed name.
236 std::vector<std::wstring> _parts; ///< Individual name parts.
237 gender _gender; ///< Gender of the first name.
238 culture _culture; ///< Culture of the first name.
239 class ng* _owner; ///< Generator that created this name.
240 std::uint64_t _seed{0}; ///< Random seed for replay.
241
242 friend class ng;
243};
244
245/// @brief Name generator that produces culture-aware names and surnames.
246///
247/// Generates realistic names by picking from popular name databases indexed
248/// by culture and gender. Supports 105 cultures.
249///
250/// Can be used as a singleton via instance() or constructed independently.
251/// Independent instances own their own name databases and random engine.
252///
253/// @par Thread safety
254/// Each instance is independent. Concurrent calls to get_name() on
255/// the **same** instance require external synchronization. load() mutates
256/// internal state and must not be called concurrently with get_name()
257/// on the same instance.
258class ng
259{
260 public:
261 /// @brief Default constructor — creates an empty generator with no names.
262 ///
263 /// Call load() to populate name databases before generating.
264 ng() = default;
265
266 ng(const ng&) = delete; ///< Not copyable.
267 ng& operator=(const ng&) = delete; ///< Not copyable.
268 ng(ng&&) noexcept = default; ///< Move constructor.
269 ng& operator=(ng&&) noexcept = default; ///< Move assignment.
270 ~ng() = default; ///< Default destructor.
271
272 /// @brief Access the global singleton instance.
273 ///
274 /// The singleton auto-probes common resource paths on first access.
275 /// For independent generators, prefer constructing a separate ng instance.
276 /// @return Reference to the global ng instance.
277 static ng& instance()
278 {
279 static ng inst{auto_probe_tag{}};
280 return inst;
281 }
282
283 /// @brief Translate an ISO 3166 2-letter country code to a culture enum.
284 /// @param country_code Two-letter country code (e.g., L"us", L"br").
285 /// @return Matching culture, or culture::any if not recognized.
286 [[nodiscard]] static culture to_culture(std::wstring_view country_code)
287 {
288 static const std::map<std::wstring, culture, std::less<>>
289 country_code_map = {
290 {L"ae", culture::emirati}, {L"af", culture::afghan},
291 {L"al", culture::albanian}, {L"ao", culture::angolan},
292 {L"ar", culture::argentinian}, {L"at", culture::austrian},
293 {L"az", culture::azerbaijani}, {L"bd", culture::bangladeshi},
294 {L"be", culture::belgian}, {L"bf", culture::burkinabe},
295 {L"bg", culture::bulgarian}, {L"bh", culture::bahraini},
296 {L"bi", culture::burundian}, {L"bn", culture::bruneian},
297 {L"bo", culture::bolivian}, {L"br", culture::brazilian},
298 {L"bw", culture::botswanan}, {L"ca", culture::canadian},
299 {L"ch", culture::swiss}, {L"cl", culture::chilean},
300 {L"cm", culture::cameroonian}, {L"cn", culture::chinese},
301 {L"co", culture::colombian}, {L"cr", culture::costarican},
302 {L"cy", culture::cypriot}, {L"cz", culture::czech},
303 {L"de", culture::german}, {L"dj", culture::djiboutian},
304 {L"dk", culture::danish}, {L"dz", culture::algerian},
305 {L"ec", culture::ecuadorian}, {L"ee", culture::estonian},
306 {L"eg", culture::egyptian}, {L"es", culture::spanish},
307 {L"et", culture::ethiopian}, {L"fi", culture::finnish},
308 {L"fj", culture::fijian}, {L"fr", culture::french},
309 {L"gb", culture::british}, {L"ge", culture::georgian},
310 {L"gh", culture::ghanaian}, {L"gr", culture::greek},
311 {L"gt", culture::guatemalan}, {L"hk", culture::hongkonger},
312 {L"hn", culture::honduran}, {L"hr", culture::croatian},
313 {L"ht", culture::haitian}, {L"hu", culture::hungarian},
314 {L"id", culture::indonesian}, {L"ie", culture::irish},
315 {L"il", culture::israeli}, {L"in", culture::indian},
316 {L"iq", culture::iraqi}, {L"ir", culture::iranian},
317 {L"is", culture::icelandic}, {L"it", culture::italian},
318 {L"jm", culture::jamaican}, {L"jo", culture::jordanian},
319 {L"jp", culture::japanese}, {L"kh", culture::cambodian},
320 {L"kr", culture::korean}, {L"kw", culture::kuwaiti},
321 {L"kz", culture::kazakh}, {L"lb", culture::lebanese},
322 {L"lt", culture::lithuanian}, {L"lu", culture::luxembourgish},
323 {L"ly", culture::libyan}, {L"ma", culture::moroccan},
324 {L"md", culture::moldovan}, {L"mo", culture::macanese},
325 {L"mt", culture::maltese}, {L"mu", culture::mauritian},
326 {L"mv", culture::maldivian}, {L"mx", culture::mexican},
327 {L"my", culture::malaysian}, {L"na", culture::namibian},
328 {L"ng", culture::nigerian}, {L"nl", culture::dutch},
329 {L"no", culture::norwegian}, {L"om", culture::omani},
330 {L"pa", culture::panamanian}, {L"pe", culture::peruvian},
331 {L"ph", culture::filipino}, {L"pl", culture::polish},
332 {L"pr", culture::puertorican}, {L"ps", culture::palestinian},
333 {L"pt", culture::portuguese}, {L"qa", culture::qatari},
334 {L"rs", culture::serbian}, {L"ru", culture::russian},
335 {L"sa", culture::saudi}, {L"sd", culture::sudanese},
336 {L"se", culture::swedish}, {L"sg", culture::singaporean},
337 {L"si", culture::slovenian}, {L"sv", culture::salvadoran},
338 {L"sy", culture::syrian}, {L"tm", culture::turkmen},
339 {L"tn", culture::tunisian}, {L"tr", culture::turkish},
340 {L"tw", culture::taiwanese}, {L"us", culture::american},
341 {L"uy", culture::uruguayan}, {L"ye", culture::yemeni},
342 {L"za", culture::southafrican}};
343
344 if (auto it = country_code_map.find(country_code);
345 it != country_code_map.end())
346 {
347 return it->second;
348 }
349 return culture::any;
350 }
351
352 /// @brief Translate a gender string to a gender enum.
353 /// @param gender_string Gender string (e.g., L"male", L"female", L"m", L"f").
354 /// @return Matching gender, or gender::any if not recognized.
355 [[nodiscard]] static gender to_gender(std::wstring_view gender_string)
356 {
357 static const std::map<std::wstring, gender, std::less<>> gender_map = {
358 {L"m", gender::m},
359 {L"f", gender::f},
360 {L"male", gender::m},
361 {L"female", gender::f}};
362
363 if (auto it = gender_map.find(gender_string);
364 it != gender_map.end())
365 {
366 return it->second;
367 }
368 return gender::any;
369 }
370
371 /// @brief Generate a first name.
372 /// @param g Gender (default: random).
373 /// @param c Culture (default: random).
374 /// @return A name object supporting chained appending.
375 /// @throws std::invalid_argument If no names loaded for the resolved culture/gender.
376 [[nodiscard]] name get_name(gender g = gender::any,
377 culture c = culture::any)
378 {
379 auto call_seed = static_cast<std::uint64_t>(_engine());
380 auto result = solver(true, g, c, call_seed);
381 result._seed = call_seed;
382 return result;
383 }
384
385 /// @brief Generate a deterministic first name using a specific seed.
386 /// @param g Gender (default: random).
387 /// @param c Culture (default: random).
388 /// @param call_seed Seed for reproducible results.
389 /// @return A name object.
390 /// @throws std::invalid_argument If no names loaded for the resolved culture/gender.
391 [[nodiscard]] name get_name(gender g, culture c,
392 std::uint64_t call_seed)
393 {
394 auto result = solver(true, g, c, call_seed);
395 result._seed = call_seed;
396 return result;
397 }
398
399 /// @brief Generate a surname.
400 /// @param c Culture (default: random).
401 /// @return A name object supporting chained appending.
402 /// @throws std::invalid_argument If no surnames loaded for the resolved culture.
403 [[nodiscard]] name get_surname(culture c = culture::any)
404 {
405 auto call_seed = static_cast<std::uint64_t>(_engine());
406 auto result = solver(false, gender::any, c, call_seed);
407 result._seed = call_seed;
408 return result;
409 }
410
411 /// @brief Generate a deterministic surname using a specific seed.
412 /// @param c Culture (default: random).
413 /// @param call_seed Seed for reproducible results.
414 /// @return A name object.
415 /// @throws std::invalid_argument If no surnames loaded for the resolved culture.
416 [[nodiscard]] name get_surname(culture c,
417 std::uint64_t call_seed)
418 {
419 auto result = solver(false, gender::any, c, call_seed);
420 result._seed = call_seed;
421 return result;
422 }
423
424 /// @name Seeding
425 /// @{
426
427 /// @brief Seed the internal random engine for deterministic sequences.
428 ///
429 /// Subsequent get_name() / get_surname() calls (without an explicit seed)
430 /// draw per-call seeds from this engine, producing a reproducible sequence.
431 ///
432 /// @param seed_value The seed value.
433 /// @return `*this` for chaining.
434 ng& seed(std::uint64_t seed_value)
435 {
436 _engine.seed(seed_value);
437 return *this;
438 }
439
440 /// @brief Reseed the engine with a non-deterministic source.
441 /// @return `*this` for chaining.
443 {
444 _engine.seed(std::random_device{}());
445 return *this;
446 }
447
448 /// @}
449
450 /// @brief Check whether any name databases have been loaded.
451 /// @return `true` if at least one name file has been loaded.
452 [[nodiscard]] bool has_resources() const
453 {
454 return !_m_pool.empty() ||
455 !_f_pool.empty() ||
456 !_sur_pool.empty();
457 }
458
459 /// @brief Load name files from a directory.
460 ///
461 /// Recursively scans @p resource_path for `.names` files and indexes them
462 /// by culture and gender. Safe to call multiple times.
463 ///
464 /// @param resource_path Directory containing `.names` files.
465 void load(const std::filesystem::path& resource_path)
466 {
467 if (std::filesystem::exists(resource_path) &&
468 std::filesystem::is_directory(resource_path))
469 {
470 for (const auto& entry :
471 std::filesystem::recursive_directory_iterator(resource_path))
472 {
473 if (entry.is_regular_file() &&
474 (entry.path().extension() == ".names"))
475 {
476 parse_file(entry);
477 }
478 }
479 }
480 }
481
482 /// @brief Load a specific dataset tier from a base resources directory.
483 ///
484 /// Probes common base paths ("resources", "../resources",
485 /// "name-generator/resources") and loads from the `lite/` or `full/`
486 /// subfolder according to @p tier.
487 ///
488 /// @param tier The dataset size to load (dataset::lite or dataset::full).
489 /// @return `true` if a matching directory was found and loaded.
490 bool load(dataset tier)
491 {
492 static constexpr std::array probe_paths = {
493 "resources", "../resources", "name-generator/resources"};
494
495 const char* subfolder =
496 (tier == dataset::full) ? "full" : "lite";
497
498 auto found = std::ranges::find_if(probe_paths, [&](const char* base) {
499 const std::filesystem::path dir =
500 std::filesystem::path{base} / subfolder;
501 return std::filesystem::is_directory(dir);
502 });
503 if (found != probe_paths.end())
504 {
505 load(std::filesystem::path{*found} / subfolder);
506 return true;
507 }
508 return false;
509 }
510
511 private:
512 // Container of names paired with their selection weights.
513 struct name_pool
514 {
515 std::vector<std::wstring> names;
516 std::vector<double> weights;
517 // Mutable because discrete_distribution::operator() is non-const
518 // (it may update internal generator state), but the distribution
519 // parameters are immutable after construction.
520 mutable std::discrete_distribution<std::size_t> dist;
521 };
522
523 // Maps for accessing names through culture.
524 std::map<culture, name_pool> _m_pool;
525 std::map<culture, name_pool> _f_pool;
526 std::map<culture, name_pool> _sur_pool;
527
528 // Per-instance random engine for seed drawing.
529 std::mt19937_64 _engine{std::random_device{}()};
530
531 // Tag type for the auto-probing singleton constructor.
532 struct auto_probe_tag {};
533
534 // Singleton constructor: auto-probes common resource locations.
535 explicit ng(auto_probe_tag /*tag*/)
536 {
537 static constexpr std::array probe_paths = {
538 "resources", "../resources", "name-generator/resources"};
539
540 auto found = std::ranges::find_if(probe_paths, [](const char* p) {
541 return std::filesystem::exists(p) &&
542 std::filesystem::is_directory(p);
543 });
544 if (found != probe_paths.end())
545 {
546 const std::filesystem::path base{*found};
547 auto lite_dir = base / "lite";
548 auto full_dir = base / "full";
549 if (std::filesystem::is_directory(lite_dir))
550 {
551 load(lite_dir);
552 }
553 else if (std::filesystem::is_directory(full_dir))
554 {
555 load(full_dir);
556 }
557 else
558 {
559 load(base);
560 }
561 }
562 }
563
564 // Resolve `any` culture to a random culture from those actually loaded.
565 static culture resolve_culture(culture c,
566 const std::map<culture, name_pool>& db,
567 effolkronium::random_local& engine)
568 {
569 if (c != culture::any) { return c; }
570 if (db.empty())
571 {
572 throw std::invalid_argument("No names loaded for any culture");
573 }
574 auto idx = engine.get<std::size_t>(0, db.size() - 1);
575 auto it = db.begin();
576 std::advance(it, static_cast<std::ptrdiff_t>(idx));
577 return it->first;
578 }
579
580 // Resolve `any` gender to a concrete random gender.
581 static gender resolve_gender(gender g,
582 effolkronium::random_local& engine)
583 {
584 if (g == gender::any)
585 {
586 return static_cast<gender>(engine.get<std::size_t>(0, 1));
587 }
588 return g;
589 }
590
591 // Convert a culture enum to its display name for error messages.
592 [[nodiscard]] static const char* culture_label(culture c)
593 {
594 static constexpr std::array labels = {
595 "afghan", "albanian", "algerian",
596 "american", "angolan", "argentinian",
597 "austrian", "azerbaijani", "bahraini",
598 "bangladeshi", "belgian", "bolivian",
599 "botswanan", "brazilian", "british",
600 "bruneian", "bulgarian", "burkinabe",
601 "burundian", "cambodian", "cameroonian",
602 "canadian", "chilean", "chinese",
603 "colombian", "costarican", "croatian",
604 "cypriot", "czech", "danish",
605 "djiboutian", "dutch", "ecuadorian",
606 "egyptian", "emirati", "estonian",
607 "ethiopian", "fijian", "filipino",
608 "finnish", "french", "georgian",
609 "german", "ghanaian", "greek",
610 "guatemalan", "haitian", "honduran",
611 "hongkonger", "hungarian", "icelandic",
612 "indian", "indonesian", "iranian",
613 "iraqi", "irish", "israeli",
614 "italian", "jamaican", "japanese",
615 "jordanian", "kazakh", "korean",
616 "kuwaiti", "lebanese", "libyan",
617 "lithuanian", "luxembourgish","macanese",
618 "malaysian", "maldivian", "maltese",
619 "mauritian", "mexican", "moldovan",
620 "moroccan", "namibian", "nigerian",
621 "norwegian", "omani", "palestinian",
622 "panamanian", "peruvian", "polish",
623 "portuguese", "puertorican", "qatari",
624 "russian", "salvadoran", "saudi",
625 "serbian", "singaporean", "slovenian",
626 "southafrican", "spanish", "sudanese",
627 "swedish", "swiss", "syrian",
628 "taiwanese", "tunisian", "turkish",
629 "turkmen", "uruguayan", "yemeni",
630 "any"};
631 auto idx = static_cast<std::size_t>(c);
632 if (idx < labels.size()) { return labels.at(idx); }
633 return "unknown";
634 }
635
636 // Convert a gender enum to its display name for error messages.
637 [[nodiscard]] static const char* gender_label(gender g)
638 {
639 switch (g)
640 {
641 case gender::m: return "male";
642 case gender::f: return "female";
643 case gender::any: return "any";
644 }
645 return "unknown";
646 }
647
648 // Pick a weighted-random name/surname from the appropriate map.
649 [[nodiscard]] static std::wstring pick(
650 const std::map<culture, name_pool>& db,
651 culture c, gender g, effolkronium::random_local& engine)
652 {
653 auto it = db.find(c);
654 if (it == db.end() || it->second.names.empty())
655 {
656 throw std::invalid_argument(
657 std::string("No ") + gender_label(g) +
658 " names loaded for culture '" + culture_label(c) + "'");
659 }
660 const auto& pool = it->second;
661 if (!pool.weights.empty())
662 {
663 // Use the pre-built distribution — avoids reconstructing
664 // the prefix-sum table on every call.
665 return pool.names.at(pool.dist(engine.engine()));
666 }
667 return *engine.get(pool.names);
668 }
669
670 /// @brief Number of bits to shift when XOR-folding a 64-bit seed to 32.
671 static constexpr unsigned seed_fold_shift = 32U;
672
673 // Append a forename part to an existing name.
674 void append_name_impl(name& n, gender g, culture c)
675 {
676 auto call_seed = static_cast<std::uint64_t>(_engine());
677 effolkronium::random_local call_engine;
678 call_engine.seed(static_cast<std::mt19937::result_type>(
679 (call_seed ^ (call_seed >> seed_fold_shift))));
680
681 const gender resolved_g = resolve_gender(g, call_engine);
682
683 const auto& db = (resolved_g == gender::f)
684 ? _f_pool : _m_pool;
685
686 const culture resolved_c = resolve_culture(c, db, call_engine);
687
688 const std::wstring part = pick(db, resolved_c, resolved_g,
689 call_engine);
690 n._parts.push_back(part);
691 n._full_string.append(L" ").append(part);
692 }
693
694 // Append a surname part to an existing name.
695 void append_surname_impl(name& n, culture c)
696 {
697 auto call_seed = static_cast<std::uint64_t>(_engine());
698 effolkronium::random_local call_engine;
699 call_engine.seed(static_cast<std::mt19937::result_type>(
700 (call_seed ^ (call_seed >> seed_fold_shift))));
701
702 const culture resolved_c = resolve_culture(c,
703 _sur_pool, call_engine);
704 const std::wstring part = pick(_sur_pool,
705 resolved_c, gender::any, call_engine);
706 n._parts.push_back(part);
707 n._full_string.append(L" ").append(part);
708 }
709
710 // Core generation logic.
711 [[nodiscard]] name solver(bool is_name, gender requested_gender,
712 culture requested_culture,
713 std::uint64_t call_seed)
714 {
715 effolkronium::random_local call_engine;
716 call_engine.seed(static_cast<std::mt19937::result_type>(
717 (call_seed ^ (call_seed >> seed_fold_shift))));
718
719 const gender resolved_gender = resolve_gender(requested_gender,
720 call_engine);
721
722 if (is_name)
723 {
724 const auto& db = (resolved_gender == gender::f)
725 ? _f_pool : _m_pool;
726 const culture resolved_culture = resolve_culture(
727 requested_culture, db, call_engine);
728 return {pick(db, resolved_culture, resolved_gender,
729 call_engine),
730 resolved_gender, resolved_culture, this};
731 }
732
733 const culture resolved_culture = resolve_culture(
734 requested_culture, _sur_pool, call_engine);
735 return {pick(_sur_pool,
736 resolved_culture, gender::any, call_engine),
737 resolved_gender, resolved_culture, this};
738 }
739
740 /// @brief Decode a UTF-8 byte string into a wide string.
741 ///
742 /// Handles 1–4 byte sequences and produces UTF-32 on Linux
743 /// (wchar_t is 4 bytes) or UTF-16 surrogate pairs on Windows
744 /// (wchar_t is 2 bytes). Invalid lead bytes are silently skipped.
745 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
746 static std::wstring utf8_to_wstring(const std::string& utf8)
747 {
748 // UTF-8 prefix masks and value masks for each sequence length.
749 static constexpr unsigned char ascii_max = 0x80U;
750 static constexpr unsigned char two_byte_mask = 0xE0U;
751 static constexpr unsigned char two_byte_lead = 0xC0U;
752 static constexpr unsigned char two_byte_val = 0x1FU;
753 static constexpr unsigned char three_byte_mask = 0xF0U;
754 static constexpr unsigned char three_byte_lead = 0xE0U;
755 static constexpr unsigned char three_byte_val = 0x0FU;
756 static constexpr unsigned char four_byte_mask = 0xF8U;
757 static constexpr unsigned char four_byte_lead = 0xF0U;
758 static constexpr unsigned char four_byte_val = 0x07U;
759 static constexpr unsigned char cont_val = 0x3FU;
760 static constexpr unsigned char cont_check_mask = 0xC0U;
761 static constexpr unsigned char cont_check_lead = 0x80U;
762 static constexpr unsigned cont_shift = 6U;
763 static constexpr char32_t max_codepoint = 0x10FFFFU;
764 // Surrogate-pair constants (UTF-16, wchar_t == 2 bytes only).
765 static constexpr char32_t surrogate_offset = 0x10000U;
766 static constexpr char32_t high_surrogate_base = 0xD800U;
767 static constexpr char32_t low_surrogate_base = 0xDC00U;
768 static constexpr unsigned surrogate_shift = 10U;
769 static constexpr char32_t surrogate_mask = 0x3FFU;
770
771 std::wstring result;
772 result.reserve(utf8.size());
773 std::size_t i = 0;
774
775 while (i < utf8.size())
776 {
777 char32_t codepoint = 0;
778 auto lead = static_cast<unsigned char>(utf8.at(i));
779 std::size_t extra = 0;
780
781 if (lead < ascii_max)
782 {
783 codepoint = lead;
784 }
785 else if ((lead & two_byte_mask) == two_byte_lead)
786 {
787 codepoint = lead & two_byte_val;
788 extra = 1;
789 }
790 else if ((lead & three_byte_mask) == three_byte_lead)
791 {
792 codepoint = lead & three_byte_val;
793 extra = 2;
794 }
795 else if ((lead & four_byte_mask) == four_byte_lead)
796 {
797 codepoint = lead & four_byte_val;
798 extra = 3;
799 }
800 else
801 {
802 ++i;
803 continue; // skip invalid lead byte
804 }
805
806 ++i;
807 bool valid = true;
808 for (std::size_t j = 0; j < extra; ++j, ++i)
809 {
810 if (i >= utf8.size())
811 {
812 valid = false;
813 break; // truncated sequence
814 }
815 auto byte = static_cast<unsigned char>(utf8.at(i));
816 if ((byte & cont_check_mask) != cont_check_lead)
817 {
818 valid = false;
819 break; // not a valid continuation byte
820 }
821 codepoint = (codepoint << cont_shift) |
822 (static_cast<char32_t>(byte) &
823 static_cast<char32_t>(cont_val));
824 }
825
826 if (!valid || codepoint > max_codepoint)
827 {
828 continue; // skip malformed or out-of-range sequence
829 }
830
831 if constexpr (sizeof(wchar_t) >= 4)
832 {
833 result.push_back(static_cast<wchar_t>(codepoint));
834 }
835 else
836 {
837 if (codepoint < surrogate_offset)
838 {
839 result.push_back(static_cast<wchar_t>(codepoint));
840 }
841 else
842 {
843 const char32_t shifted = codepoint - surrogate_offset;
844 result.push_back(static_cast<wchar_t>(
845 high_surrogate_base + (shifted >> surrogate_shift)));
846 result.push_back(static_cast<wchar_t>(
847 low_surrogate_base + (shifted & surrogate_mask)));
848 }
849 }
850 }
851 return result;
852 }
853
854 // Parse a .names file and index it into the appropriate map.
855 // Uses std::ifstream (byte-oriented) + utf8_to_wstring so that
856 // non-ASCII names load correctly regardless of the system locale.
857 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
858 void parse_file(const std::filesystem::path& file)
859 {
860 std::ifstream tentative_file{file};
861
862 if (tentative_file.is_open())
863 {
864 std::string raw_line;
865
866 // Read culture from the first line.
867 if (!std::getline(tentative_file, raw_line))
868 {
869 return;
870 }
871 if (!raw_line.empty() && raw_line.back() == '\r')
872 {
873 raw_line.pop_back();
874 }
875 const culture culture_read =
876 to_culture(utf8_to_wstring(raw_line));
877
878 // We can't continue without a valid culture.
879 if (culture_read == culture::any)
880 {
881 return;
882 }
883
884 // Read gender from the second line.
885 if (!std::getline(tentative_file, raw_line))
886 {
887 return;
888 }
889 if (!raw_line.empty() && raw_line.back() == '\r')
890 {
891 raw_line.pop_back();
892 }
893 const gender gender_read = to_gender(utf8_to_wstring(raw_line));
894
895 // Read all remaining lines as names with optional weights.
896 // Format: "name\tweight" or just "name" (weight defaults to 1.0).
897 name_pool pool;
898 while (std::getline(tentative_file, raw_line))
899 {
900 if (!raw_line.empty() && raw_line.back() == '\r')
901 {
902 raw_line.pop_back();
903 }
904 if (!raw_line.empty())
905 {
906 auto tab_pos = raw_line.find('\t');
907 if (tab_pos != std::string::npos)
908 {
909 pool.names.push_back(
910 utf8_to_wstring(raw_line.substr(0, tab_pos)));
911 pool.weights.push_back(
912 std::stod(raw_line.substr(tab_pos + 1)));
913 }
914 else
915 {
916 pool.names.push_back(utf8_to_wstring(raw_line));
917 pool.weights.push_back(1.0);
918 }
919 }
920 }
921
922 if (pool.names.empty())
923 {
924 return;
925 }
926
927 // Pre-build the weighted distribution for O(1) generation.
928 if (!pool.weights.empty())
929 {
930 pool.dist = std::discrete_distribution<std::size_t>(
931 pool.weights.begin(), pool.weights.end());
932 }
933
934 // Index by gender.
935 switch (gender_read)
936 {
937 case gender::m:
938 _m_pool[culture_read] = std::move(pool);
939 break;
940 case gender::f:
941 _f_pool[culture_read] = std::move(pool);
942 break;
943 default:
944 _sur_pool[culture_read] = std::move(pool);
945 break;
946 }
947 }
948 }
949
950 friend class name;
951 friend struct ::ng_test_access;
952};
953
954// Out-of-line definitions for name methods that call into ng.
956{
957 _owner->append_name_impl(*this, _gender, _culture);
958 return *this;
959}
960
962{
963 _owner->append_name_impl(*this, _gender, c);
964 return *this;
965}
966
968{
969 _owner->append_surname_impl(*this, _culture);
970 return *this;
971}
972
974{
975 _owner->append_surname_impl(*this, c);
976 return *this;
977}
978
979} // namespace dasmig
980
981#endif // DASMIG_NAMEGEN_HPP
Return type for name generation, holding both individual parts and the full composed string.
Definition namegen.hpp:167
friend std::wostream & operator<<(std::wostream &wos, const name &n)
Stream the name to a wide output stream.
Definition namegen.hpp:219
name & append_surname()
Append a surname to this name, preserving culture.
Definition namegen.hpp:967
name & append_name()
Append a forename to this name, preserving gender and culture.
Definition namegen.hpp:955
const std::vector< std::wstring > & parts() const
Return the individual parts (names/surnames) as a vector.
Definition namegen.hpp:179
std::uint64_t seed() const
Retrieve the random seed used to generate this name.
Definition namegen.hpp:172
Name generator that produces culture-aware names and surnames.
Definition namegen.hpp:259
name get_name(gender g=gender::any, culture c=culture::any)
Generate a first name.
Definition namegen.hpp:376
ng & operator=(const ng &)=delete
Not copyable.
ng(const ng &)=delete
Not copyable.
ng & seed(std::uint64_t seed_value)
Seed the internal random engine for deterministic sequences.
Definition namegen.hpp:434
name get_surname(culture c=culture::any)
Generate a surname.
Definition namegen.hpp:403
bool has_resources() const
Check whether any name databases have been loaded.
Definition namegen.hpp:452
ng()=default
Default constructor — creates an empty generator with no names.
ng & unseed()
Reseed the engine with a non-deterministic source.
Definition namegen.hpp:442
bool load(dataset tier)
Load a specific dataset tier from a base resources directory.
Definition namegen.hpp:490
static gender to_gender(std::wstring_view gender_string)
Translate a gender string to a gender enum.
Definition namegen.hpp:355
void load(const std::filesystem::path &resource_path)
Load name files from a directory.
Definition namegen.hpp:465
static ng & instance()
Access the global singleton instance.
Definition namegen.hpp:277
name get_surname(culture c, std::uint64_t call_seed)
Generate a deterministic surname using a specific seed.
Definition namegen.hpp:416
name get_name(gender g, culture c, std::uint64_t call_seed)
Generate a deterministic first name using a specific seed.
Definition namegen.hpp:391
ng(ng &&) noexcept=default
Move constructor.
static culture to_culture(std::wstring_view country_code)
Translate an ISO 3166 2-letter country code to a culture enum.
Definition namegen.hpp:286
@ full
Complete dataset (~39 MB).
@ lite
Top-500 names per category (~2 MB).
culture
Culture representing a country or a broader group.
Definition namegen.hpp:34
gender
Simple gender enum to distinguish between male and female names.
Definition namegen.hpp:155