{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"iso_639_3","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0,3.025,3.05,3.075,3.1,3.125,3.15,3.175,3.2,3.225,3.25,3.275,3.3,3.325,3.35,3.375,3.4,3.425,3.45,3.475,3.5]},"near_unique":true,"sample":["abq","qui","tys","sby","kzn","wca","pbn","tpy","sod","aoa","mtj","mva","esg","mgj","gwb","wlc","bae","ssu","zpn","kew","hnd","anj","cna","bts","org","rml","mvb","cui","bcr","yry","ngd","nwe","ngw","kcb","kce","khe","wlw","mkd","dkr","mlt","snc","vao","oub","vap","wmo","yyr","jra","kyz","cay","tob"],"top_values":[],"top_words":[["aou",1],["aiw",1],["aas",1],["kbt",1],["abf",1],["bzy",1],["abm",1],["aau",1],["abq",1],["aba",1],["aaq",1],["abe",1],["abi",1],["bsa",1],["pcn",1],["abk",1],["aob",1],["abo",1],["abr",1],["ado",1],["abn",1],["abz",1],["kgr",1],["abu",1],["mgj",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7130,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":7130,"n_null":0,"n_unique":7130,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":3,"len_mean":3.0,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":120.37400000000001,"url_rate":0.0,"vocab_size":7130,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"72.9% rows are a single word"}],"column":"name","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[25,195,761,1106,1096,847,529,351,278,238,213,215,191,202,149,95,75,72,66,70,154,50,26,30,23,8,7,10,14,6,8,2,10,1,4,0,1,0,1,1],"edges":[1.0,2.05,3.1,4.15,5.2,6.25,7.300000000000001,8.350000000000001,9.4,10.450000000000001,11.5,12.55,13.600000000000001,14.65,15.700000000000001,16.75,17.8,18.85,19.900000000000002,20.95,22.0,23.05,24.1,25.150000000000002,26.200000000000003,27.25,28.3,29.35,30.400000000000002,31.450000000000003,32.5,33.550000000000004,34.6,35.65,36.7,37.75,38.800000000000004,39.85,40.9,41.95,43.0]},"near_unique":true,"sample":["Abaza","Quileute","T\u00e0y Sa Pa","Soli","Kokola","Yanom\u00e1mi","Kpasam","Trumai","Songoora","Angolar","Moskona","Manam","Aheri Gondi","Abureni","Gwa","Mwali Comorian","Bar\u00e9","Susuami","Santa In\u00e9s Yatzechi Zapotec","West Kewa","Southern Hindko","Anor","Changthang","Batak Simalungun","Oring","Baltic Romani","Mattole","Cuiba","Babine","Yarluyandi","Ngando (Central African Republic)","Ngwe","Ngwaba","Kawacha","Kaivi","Korowai","Walak","Macedonian","Kuijau","Maltese","Sinaugoro","Vao","Glio-Oubi","Vaiphei","Wom (Papua New Guinea)","Yir Yoront","Jarai","Kayab\u00ed","Cayuga","Toba"],"top_values":[],"top_words":[["language",153],["sign",152],["southern",70],["northern",65],["zapotec",58],["mixtec",52],["western",51],["eastern",49],["naga",49],["central",45],["creole",34],["quechua",33],["arabic",32],["chin",32],["new",31],["guinea)",28],["nahuatl",28],["(papua",27],["north",26],["malay",26],["san",22],["south",21],["english",20],["dogon",19],["(nigeria)",18]],"vocab_skipped":null,"word_histogram":{"counts":[5199,0,0,0,0,0,1382,0,0,0,0,0,415,0,0,0,0,0,103,0,0,0,0,0,27,0,0,0,0,4],"edges":[1.0,1.1666666666666667,1.3333333333333333,1.5,1.6666666666666665,1.8333333333333333,2.0,2.1666666666666665,2.333333333333333,2.5,2.6666666666666665,2.833333333333333,3.0,3.1666666666666665,3.333333333333333,3.5,3.6666666666666665,3.833333333333333,4.0,4.166666666666666,4.333333333333333,4.5,4.666666666666666,4.833333333333333,5.0,5.166666666666666,5.333333333333333,5.5,5.666666666666666,5.833333333333333,6.0]}},"kind":"text","n":7130,"n_null":0,"n_unique":7130,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":43,"len_mean":8.993688639551193,"len_median":7.0,"len_min":1,"len_p95":21.0,"n_duplicates":0,"n_empty":0,"one_word_rate":0.7291725105189341,"readability_flesch_mean":56.15055000000003,"url_rate":0.0,"vocab_size":7124,"word_mean":1.3715287517531556,"word_median":1.0}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"joshua_project","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"glottolog","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"language_history","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"us_indigenous","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"speaker_count","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"data_sources","extras":{},"kind":"unknown","n":7130,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}}],"insights":{"errors":[],"insights":[{"confidence":"medium","critiques":[],"evidence_keys":["row_count","column_count","columns[0].n_unique","columns[1].n_unique","columns[1].stats.len_max","columns[1].stats.len_mean","columns[1].top_words","columns[1].stats.one_word_rate"],"featured_charts":[{"caption":"Top words in language names \u2014 look for dominant directional terms (Southern, Northern) and prolific language families like Zapotec and Mixtec.","column":"name","kind":"bar"},{"caption":"Distribution of language name lengths \u2014 most names are short (median 7 characters) but a long tail reaches up to 43 characters.","column":"name","kind":"histogram"},{"caption":"All ISO 639-3 codes are exactly 3 characters long, confirming full standard compliance across all 7,130 entries.","column":"iso_639_3","kind":"length"}],"model":"anthropic:default","narrative":"This dataset is a reference catalogue of 7,130 world languages, each identified by a unique ISO 639-3 three-letter code alongside its name and several linked data sources (Glottolog, Joshua Project, speaker counts, and more). Every row is distinct \u2014 no duplicate language codes or names \u2014 making this primarily a lookup/reference table. Two things stand out for closer inspection: first, the name column reveals notable clusters around directional qualifiers (Southern, Northern, Western, Eastern) and language families like Zapotec (58), Mixtec (52), and Naga (49), suggesting rich geographic and genealogical structure worth exploring. Second, 'sign' appears 152 times in language names, indicating a surprisingly large representation of sign languages across the world's documented tongues.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["len_min","len_max","len_mean","n_unique","n","duplicate_rate","null_rate","vocab_size","top_words"],"model":"anthropic:default","narrative":"This column contains ISO 639-3 language codes \u2014 the international standard three-letter identifiers for individual human languages. Every value is exactly 3 characters long (len_min=3, len_max=3, len_mean=3.0), all lowercase, and completely unique across all 7,130 rows with zero nulls or duplicates. The 7,130 distinct codes is strikingly close to the total number of living languages catalogued by ISO 639-3 (~7,000+), suggesting this may be a near-complete reference table of language codes.","role":"identifier","scope":"column","target":"iso_639_3","treatment":"Use as a primary key or join key; left-join to enrich with language metadata (name, family, region)."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","n","duplicate_rate","null_rate","top_words","one_word_rate","word_mean","len_max","vocab_size"],"model":"anthropic:default","narrative":"This column contains names of human languages or dialects, evidenced by the dominant top words ('language', 'sign', 'zapotec', 'mixtec', 'naga') and directional qualifiers ('southern', 'northern', 'western', 'eastern', 'central') typical of language taxonomy naming conventions. All 7,130 rows are unique with zero duplicates and zero nulls, making this a near-perfect identifier. The one-word rate of 72.9% and a mean word count of 1.37 reflect the compact, often single-token nature of language names, while the max length of 43 characters accommodates longer multi-word dialect names. The vocabulary size of 7,124 against 7,130 rows confirms near-total uniqueness with only minimal token reuse.","role":"label","scope":"column","target":"name","treatment":"Use as a human-readable label or entity key; do not encode directly \u2014 map to a numeric ID or embed with a language-model encoder if semantic similarity is needed."},{"confidence":"low","critiques":[],"evidence_keys":["alerts","kind","n","null_rate","n_unique","stats"],"model":"anthropic:default","narrative":"The column 'data_sources' has 7130 rows with zero nulls, but saturn skipped profiling it entirely \u2014 no type was resolved, no unique count was computed, and no statistics were collected. This suggests the column contains a complex or nested type (e.g., JSON, arrays, or mixed structures) that the profiler could not parse. No further characterisation is possible from the available evidence.","role":"other","scope":"column","target":"data_sources","treatment":"Inspect raw values to determine structure (e.g., JSON array, delimited string), then parse and flatten before any downstream use."},{"confidence":"low","critiques":[],"evidence_keys":["column","kind","n","null_rate","alerts"],"model":"anthropic:default","narrative":"This column contains Glottolog codes, which are standardized four-letter+four-digit identifiers for the world's languages and language families maintained by the Glottolog database. The profiler skipped detailed analysis (kind='unknown', no stats, null_rate=0.0), so cardinality and value distribution are unavailable. With 7130 rows and zero nulls, the column is fully populated, but uniqueness cannot be assessed from the available evidence.","role":"foreign_key","scope":"column","target":"glottolog","treatment":"Left-join on this code against the Glottolog reference database to enrich with language family, macroarea, and geographic metadata."},{"confidence":"low","critiques":[],"evidence_keys":["column","n","null_rate","alerts","kind"],"model":"anthropic:default","narrative":"This column references the Joshua Project, a well-known ethnoreligious people-group classification system commonly used in missiology and demographic datasets. The profiler returned a 'skipped' alert with no computed stats or uniqueness count, meaning the column type was not resolved and no distributional analysis was available. With 7,130 non-null rows and zero null rate, the data is fully populated, but nothing further can be inferred about cardinality, value distribution, or data type from the evidence provided.","role":"label","scope":"column","target":"joshua_project","treatment":"Resolve column type manually, then assess cardinality to determine whether to treat as a categorical label or foreign key joining to a Joshua Project people-group reference table."},{"confidence":"low","critiques":[],"evidence_keys":["alerts","kind","n","null_rate","stats"],"model":"anthropic:default","narrative":"This column contains an unknown data type that Saturn skipped during profiling, likely a complex or nested structure (e.g., JSON, array, or serialized object) representing a history of languages associated with each record. With 7,130 rows, zero nulls, and no stats computed, the actual content and distribution are entirely opaque from this evidence alone. The 'skipped' alert means no cardinality, frequency, or value-level analysis was performed, so downstream usability is unknown.","role":"other","scope":"column","target":"language_history","treatment":"Inspect raw values to determine structure (e.g., parse JSON/arrays), then flatten or encode before modelling."},{"confidence":"low","critiques":[],"evidence_keys":["column","n","null_rate","alerts","kind"],"model":"anthropic:default","narrative":"The column 'speaker_count' is likely a numeric count of speakers per record (e.g., per document, meeting, or audio segment). No distributional statistics or uniqueness data were computed \u2014 the profiler skipped this column, possibly due to an unrecognised dtype or an upstream parsing issue. With 7,130 non-null rows and zero nulls, the data is complete, but no further characterisation is possible from the available evidence.","role":"feature","scope":"column","target":"speaker_count","treatment":"Resolve the 'skipped' profiling alert by checking dtype (may be stored as string or object); cast to integer and re-profile before using as a numeric feature."},{"confidence":"low","critiques":[],"evidence_keys":["alerts","n","null_rate","n_unique","stats"],"model":"anthropic:default","narrative":"The column 'us_indigenous' likely represents a binary or categorical indicator of US Indigenous identity/ethnicity for 7,130 records. The profiler emitted a 'skipped' alert with no computed stats or uniqueness count, meaning the column's data type or content prevented standard analysis. No nulls are present, but without distribution or cardinality data, the actual value composition (e.g., boolean flags, counts, coded strings) cannot be determined from this evidence alone.","role":"feature","scope":"column","target":"us_indigenous","treatment":"Inspect raw values to confirm encoding (boolean, 0/1, string category), then encode appropriately before modelling."}],"providers":["anthropic:default"],"total_usage":{"completion_tokens":2199,"prompt_tokens":5669,"total_tokens":7868}},"language_counts":{},"meta":{"generated_at":"2026-06-22T00:12:06+00:00","mode":"full","row_count":7130,"sampled_rows":7130,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/data/linguistic/world_languages_integrated.json"},"notes":[],"saturn_version":"0.2.0","schema":{"data_sources":"unknown","glottolog":"unknown","iso_639_3":"text","joshua_project":"unknown","language_history":"unknown","name":"text","speaker_count":"unknown","us_indigenous":"unknown"}}
