{"columns":[{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.9% duplicate strings"}],"column":"glottocode","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105465,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[7.5,7.525,7.55,7.575,7.6,7.625,7.65,7.675,7.7,7.725,7.75,7.775,7.8,7.825,7.85,7.875,7.9,7.925,7.95,7.975,8.0,8.025,8.05,8.075,8.1,8.125,8.15,8.175,8.2,8.225,8.25,8.275,8.3,8.325,8.35,8.375,8.4,8.425,8.45,8.475,8.5]},"near_unique":false,"sample":["lakk1252","lazz1240","kham1282","nort2722","west2456","gurd1238","suga1248","kham1282","gata1239","paez1247","jenn1240","izer1241","wann1242","telu1262","kara1476","aika1237","bero1242","urdu1245","west2443","torw1241","kwaz1243","wapi1253","naxi1245","nyah1250","juru1256","chec1245","kaum1238","tigr1270","yaku1245","wikn1246","nort2972","iton1250","iton1250","nyon1241","juan1238","huba1236","burd1238","sooo1256","tivv1240","alla1248","nene1249","dyug1238","timn1235","guda1243","kara1476","atam1239","waya1269","bamu1256","dadi1250","buru1296"],"top_values":[["kham1282",622],["osse1243",483],["dutc1256",395],["stan1293",370],["hind1269",342],["gwan1268",323],["lith1251",315],["chec1245",309],["iris1253",288],["kaba1278",281],["beng1280",263],["buru1296",251],["sind1272",235],["tibe1272",223],["shix1238",221],["lazz1240",216],["basq1248",215],["east2328",213],["katc1249",213],["khar1287",208]],"top_words":[["kham1282",102],["osse1243",100],["hind1269",61],["lith1251",60],["stan1293",59],["iris1253",57],["gwan1268",56],["dutc1256",55],["kaba1278",53],["buru1296",52],["hung1274",49],["chec1245",48],["lazz1240",48],["basq1248",48],["lakk1252",46],["khar1287",44],["sind1272",44],["shix1238",43],["tibe1272",42],["beng1280",42],["nepa1254",42],["east2328",42],["yuch1247",42
],["tach1250",41],["kash1277",39]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105465,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":19,"n_unique":2176,"null_rate":0.00018012210382617268,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.9793675626985255,"emoji_rate":0.0,"len_max":8,"len_mean":8.0,"len_median":8.0,"len_min":8,"len_p95":8.0,"n_duplicates":103289,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":97.10900000000004,"url_rate":0.0,"vocab_size":2166,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"98.0% duplicate 
strings"}],"column":"iso_639_3","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0,3.025,3.05,3.075,3.1,3.125,3.15,3.175,3.2,3.225,3.25,3.275,3.3,3.325,3.35,3.375,3.4,3.425,3.45,3.475,3.5]},"near_unique":false,"sample":["lbe","dlg","khg","ceb","xwl","gdj","cce","gla","gaq","pbb","xuj","izr","wob","tel","gby","tbi","bom","urd","mis","trw","lbj","wap","ncj","cbn","urb","che","nyf","tiw","sah","wua","emp","lec","lec","bvm","adn","kwl","bxn","bcq","tiv","bqv","yrk","dyd","tik","wmb","gbd","amz","woc","bvm","mps","bsk"],"top_values":[["mis",828],["khg",622],["oss",525],["nld",395],["eng",370],["hin",342],["gwn",323],["lit",315],["che",309],["gle",288],["nyf",282],["kbd",281],["ben",263],["eus",258],["sgw",254],["bsk",251],["xtc",245],["snd",235],["bod",223],["sxg",221]],"top_words":[["mis",158],["khg",106],["oss",104],["eng",63],["nld",63],["gwn",63],["hin",61],["kbd",58],["gle",57],["bsk",57],["lit",56],["lzz",55],["sgw",55],["eus",47],["snd",47],["che",46],["mhr",46],["nyf",45],["lbe",45],["mya",44],["xtc",44],["bod",44],["khr",43],["tel",42],["sxg",41]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105459,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":25,"n_unique":2094,"null_rate":0.0002370027681923325,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"
duplicate_rate":0.9801439421955452,"emoji_rate":0.0,"len_max":3,"len_mean":3.0,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":103365,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":119.95100000000002,"url_rate":0.0,"vocab_size":2089,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.0% duplicate strings"}],"column":"phoneme","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[67114,0,0,0,28726,0,0,0,6559,0,0,0,2225,0,0,0,401,0,0,0,267,0,0,0,104,0,0,0,5,0,0,0,70,0,0,0,1,0,0,12],"edges":[1.0,1.25,1.5,1.75,2.0,2.25,2.5,2.75,3.0,3.25,3.5,3.75,4.0,4.25,4.5,4.75,5.0,5.25,5.5,5.75,6.0,6.25,6.5,6.75,7.0,7.25,7.5,7.75,8.0,8.25,8.5,8.75,9.0,9.25,9.5,9.75,10.0,10.25,10.5,10.75,11.0]},"near_unique":false,"sample":["\u03c7","u","ndz","r","\u02e6","\u0268","d\u0324\u026e\u0324","u","i\u0303","a\u0303","t\u032a","ts","o","l","u","u\u0303","p","\u027d\u0324","u","t\u032a","\u0294","\u0290","\u0282","i","j","u\u032fo\u0303\u02d0","t","j","l","t\u032a\u0349","a\u0303","l","\u0294","b","\u0254","m","r","\u025b","t\u0320\u0283","f","w","\u0288\u0349","\u0254","n","\u0235","p","a","t\u032a","o\u031e\u0303","t\u0255\u02b0"],"top_values":[["m",2915],["i",2779],["k",2729],["j",2716],["u",2646],["a",2600],["p",2593],["w",2483],["n",2350],["t",2064],["l",2044],["s",2021],["b",1906],["\u014b",1898],["e",1842],["o",1826],["\u0261",1712],["h",1703],["d",1376],["r",1332]],"top_words":[["m",572],["i",537],["k",520],["u",516],["j",508],["a",506],["p",498],["w",459],["n",446],["l",395],["s",392],["b",363],["t",361],["o",358],["\u014b",349],["e",345],["\u0261",323],["h",307],["r",272],["f",256],["d",243],["\u0272",240],["t\u0320\u0283",237],["\u0294",215],["\u0254",212]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_unique":3142,"null_rate":0.0,"stats":{"allcaps_rate":0.0017538204846232605,"boilerplate_rate":0.0,"duplicate_rate":0.9702134920935876,"emoji_rate":0.0,"len_max":11,"len_mean":1.5006067270865724,"len_median":1.0,"len_min":1,"len_p95":3.0,"n_duplicates":102342,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":114.43930000000005,"url_rate":0.0,"vocab_size":1339,"word_mean":1.0,"word_median":1.0}},{"alerts":[],"column":"segment_class","extras":{"singletons":0,"top_values":[["consonant",72282],["vowel",31052],["tone",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":1.0075090073337545,"entropy_ratio":0.6356674097181094,"top_rate":0.685241363619127,"top_value":"consonant"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 98.0% of rows"}],"column":"tone","extras":{"singletons":0,"top_values":[["0",103334],["+",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.14358135440365866,"entropy_ratio":0.14358135440365866,"top_rate":0.9796177619354594,"top_value":"0"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 98.0% of 
rows"}],"column":"stress","extras":{"singletons":0,"top_values":[["-",103334],["0",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.14358135440365866,"entropy_ratio":0.14358135440365866,"top_rate":0.9796177619354594,"top_value":"-"}},{"alerts":[],"column":"syllabic","extras":{"singletons":0,"top_values":[["-",72248],["+",30692],["0",2150],["+,-",244],["-,+",124],["-,+,-",12],["-,+,+",12],["+,+,-",2]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.0416449706070494,"entropy_ratio":0.3472149902023498,"top_rate":0.6849190398543855,"top_value":"-"}},{"alerts":[],"column":"source","extras":{"singletons":0,"top_values":[["ph",36274],["ea",16883],["upsid",13966],["er",9423],["saphon",9047],["aa",8064],["spa",7566],["ra",4261]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":2.6973229908177236,"entropy_ratio":0.8991076636059079,"top_rate":0.34388153653634673,"top_value":"ph"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","columns.iso_639_3.n_unique","columns.glottocode.n_unique","columns.segment_class.top_values","columns.source.top_values","columns.source.top_rate","columns.phoneme.top_values","columns.stress.top_rate","columns.tone.top_rate","columns.syllabic.top_values"],"featured_charts":[{"caption":"Shows the consonant/vowel/tone split \u2014 consonants dominate at ~68% of rows.","column":"segment_class","kind":"donut"},{"caption":"Compares the 8 contributing inventories; 'ph' supplies about a third of all rows.","column":"source","kind":"bar"},{"caption":"Top phoneme symbols across languages \u2014 /m/, /i/, /k/ lead, matching typological expectations.","column":"phoneme","kind":"bar"},{"caption":"Distribution of the syllabic feature; mostly '-' vs '+' with a long tail of rare combined 
values.","column":"syllabic","kind":"bar"},{"caption":"Language coverage by ISO code \u2014 check which languages contribute the most phoneme rows.","column":"iso_639_3","kind":"bar"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset is a phoneme inventory table with 105,484 rows and 8 columns, indexing phonemes by language (via iso_639_3 and glottocode) along with phonological features like segment_class, syllabic, stress, and tone, plus a source attribution. Coverage spans roughly 2,094 ISO languages and 2,176 Glottolog codes, with 'mis' (828 rows) and 'kham1282' (622 rows) being the most represented. Worth a closer look first: the segment_class and source distributions, since segment_class shows a clear consonant-heavy mix (72,282 consonants vs 31,052 vowels vs 2,150 tones) and source is dominated by 'ph' at 34% but spreads across 8 datasets, hinting at where data density comes from. The phoneme column itself is also informative \u2014 common segments like /m/, /i/, /k/, /j/ top the list, matching well-known cross-linguistic frequencies. Note that stress and tone are highly imbalanced (~98% one value) and largely redundant with the 'tone' segment_class.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_mean","stats.one_word_rate","stats.duplicate_rate","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds Glottocodes \u2014 fixed 8-character language identifiers (len_min/max/mean all 8, one_word_rate 1.0) from the Glottolog catalog. Across 105,484 rows there are only 2,176 unique codes with a 97.9% duplicate rate, so each code tags many records; top codes like kham1282 (622) and osse1243 (483) dominate. 
Null rate is negligible (0.0002) and vocabulary (2,166) closely matches unique count, indicating clean categorical data rather than free text.","role":"foreign_key","scope":"column","target":"glottocode","treatment":"Treat as a categorical key; left-join to a Glottolog reference table for language metadata."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_mean","stats.one_word_rate","stats.duplicate_rate","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds ISO 639-3 language codes: every value is exactly 3 characters and one word, with 2,094 distinct codes across 105,484 rows. The distribution is heavy-tailed \u2014 the top code 'mis' (the ISO 'uncoded languages' placeholder) leads at 828 occurrences, followed by 'khg' and 'oss', and duplicates account for 98.01% of rows by design. Null rate is negligible (0.0002), and the prevalence of 'mis' is worth flagging since it signals unidentified languages rather than a real language label.","role":"feature","scope":"column","target":"iso_639_3","treatment":"Treat as a categorical code; one-hot or target-encode, and decide whether to drop or bucket the 'mis' placeholder."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","stats.duplicate_rate","stats.len_mean","stats.len_max","stats.one_word_rate","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short phoneme tokens \u2014 every entry is a single word, mean length 1.5 characters and max 11, with top values being individual letters like 'm', 'i', 'k', 'j'. Despite 105,484 rows there are only 3,142 distinct values and a 97.0% duplicate rate, so this behaves as a small categorical alphabet rather than free text. 
Vocab size of 1,339 suggests multi-character phoneme codes exist alongside the single-letter majority.","role":"feature","scope":"column","target":"phoneme","treatment":"Treat as a categorical phoneme code \u2014 label-encode or one-hot rather than tokenizing as text."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This is a 3-level categorical labeling each row as a phonological segment class: consonant, vowel, or tone. The distribution is heavily imbalanced \u2014 consonants account for 68.5% (72,282), vowels 31,052, and tones only 2,150 \u2014 yielding entropy of 1.01 (entropy ratio 0.64). No nulls across 105,484 rows.","role":"label","scope":"column","target":"segment_class","treatment":"One-hot or ordinal encode; consider class-weighting or stratified sampling given the rare 'tone' class."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Binary tone flag with values \"0\" and \"+\", most likely the phonological [tone] feature rather than sentiment or polarity: \"+\" marks tone segments and \"0\" marks rows where the feature does not apply (the 2,150 \"+\" rows equal the count of the 'tone' segment_class). The distribution is severely imbalanced: \"0\" covers 97.96% of 105,484 rows, leaving only 2,150 \"+\" cases, and entropy ratio is just 0.144. No nulls are present.","role":"label","scope":"column","target":"tone","treatment":"Treat as a near-constant phonological feature that is largely redundant with segment_class; if it is used as a binary target, stratify splits and apply class weighting or resampling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Binary categorical flag with only two values, '-' and '0', where '-' dominates at 97.96% of 105,484 rows and '0' covers the remaining 2,150. 
Entropy ratio of 0.14 confirms the column carries almost no information; the '-' token is most likely the negative value of the phonological [stress] feature rather than a missing-data placeholder, with '0' marking the 2,150 rows (the same count as the 'tone' segment_class) where the feature does not apply. As a stress indicator it is severely imbalanced and likely unusable as-is.","role":"feature","scope":"column","target":"stress","treatment":"Drop for modelling, since the near-constant column is unlikely to help; do not recode '-' as missing, because that would blank out about 98% of rows."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","top_value","top_rate","top_values","entropy_ratio","null_rate"],"model":"anthropic:claude-opus-4-7","narrative":"This appears to be a phonological feature column encoding the [syllabic] distinctive feature, dominated by binary values '-' (68.5%) and '+' (29.1%) with 2150 entries marked '0' (likely unspecified). Notably, 394 rows carry comma-separated compound values like '+,-' or '-,+,-', suggesting segments with sequential feature changes (e.g., affricates or contour segments). Entropy ratio of 0.347 confirms heavy concentration in the top category.","role":"feature","scope":"column","target":"syllabic","treatment":"One-hot encode, or split compound comma values into sequence positions before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical provenance tag with 8 distinct values across 105,484 rows and no nulls, almost certainly indicating which source database or inventory each record came from (e.g., 'ph', 'ea', 'upsid', 'saphon'). Distribution is moderately concentrated: 'ph' accounts for 34.4% of rows while the smallest source 'ra' contributes only 4,261, yielding a high entropy ratio of 0.90. 
No single source dominates outright, so this is a usable stratification key rather than a near-constant flag.","role":"metadata","scope":"column","target":"source","treatment":"Keep as a categorical grouping/stratification variable; one-hot encode if used as a model feature."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":3143,"prompt_tokens":10766,"total_tokens":13909}},"language_counts":{},"meta":{"generated_at":"2026-05-01T17:11:31+00:00","mode":"full","row_count":105484,"sampled_rows":105484,"seed":42,"source":"/home/coolhand/servers/diachronica/etymology_atlas/parquet/phonemes.parquet"},"notes":[],"saturn_version":"0.2.0","schema":{"glottocode":"text","iso_639_3":"text","phoneme":"text","segment_class":"categorical","source":"categorical","stress":"categorical","syllabic":"categorical","tone":"categorical"}}
