{"columns":[{"alerts":[],"column":"phoneme_id","extras":{"histogram":{"counts":[2638,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2638,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2638,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2637,2638],"edges":[1.0,2638.075,5275.15,7912.224999999999,10549.3,13186.375,15823.449999999999,18460.524999999998,21097.6,23734.675,26371.75,29008.824999999997,31645.899999999998,34282.975,36920.049999999996,39557.125,42194.2,44831.274999999994,47468.35,50105.424999999996,52742.5,55379.575,58016.649999999994,60653.725,63290.799999999996,65927.875,68564.95,71202.025,73839.09999999999,76476.17499999999,79113.25,81750.325,84387.4,87024.47499999999,89661.54999999999,92298.625,94935.7,97572.775,100209.84999999999,102846.92499999999,105484.0]},"sample":[560.0,687.0,774.0,1045.0,1469.0,1882.0,1894.0,2055.0,2276.0,2330.0,2389.0,2401.0,2622.0,2647.0,3240.0,3363.0,3648.0,3917.0,3940.0,4380.0,4602.0,4720.0,4731.0,6130.0,6703.0,6838.0,7128.0,7134.0,8021.0,8325.0,8338.0,8526.0,8709.0,8887.0,8896.0,9024.0,9220.0,9371.0,9470.0,9521.0,9579.0,9676.0,9889.0,9954.0,9965.0,10141.0,10216.0,10883.0,11162.0,11167.0,11346.0,11433.0,11511.0,12038.0,12219.0,12413.0,12518.0,12542.0,12842.0,12936.0,13047.0,13278.0,13422.0,13453.0,13650.0,13816.0,13894.0,13940.0,14686.0,14693.0,14756.0,14767.0,14799.0,14877.0,15166.0,15210.0,15628.0,15947.0,15964.0,16019.0,16153.0,16207.0,16884.0,16951.0,16960.0,17339.0,17354.0,17481.0,17502.0,17558.0,17955.0,18369.0,18438.0,18599.0,18886.0,18891.0,19034.0,19137.0,19169.0,19809.0,19906.0,20445.0,20701.0,21008.0,21059.0,21153.0,21291.0,21354.0,22559.0,23537.0,23840.0,23864.0,24205.0,24244.0,24537.0,24595.0,24902.0,25112.0,25173.0,25391.0,25680.0,25801.0,26421.0,26961.0,27471.0,27654.0,27721.0,27967.0,28107.0,28640.0,29038.0,29067.0,29073.0,29125.0,29438.0,29561.0,29584.0,29614.0,29960.0,30202.0,30298.0,30367.0,30574.0,30642.0,30868.0,31025.0,31256.0,31475.0,31574.0,31712.0,31728.0,31817.0,31952.0
,32190.0,32479.0,32644.0,32704.0,32821.0,33078.0,33241.0,33284.0,33402.0,33768.0,33770.0,33777.0,34228.0,34691.0,34714.0,34907.0,35152.0,35254.0,35356.0,35652.0,36035.0,36509.0,36531.0,37236.0,38055.0,38224.0,38290.0,38515.0,38526.0,38696.0,38918.0,38937.0,39065.0,39368.0,39470.0,40038.0,40073.0,40716.0,41915.0,42258.0,42731.0,42948.0,43025.0,43160.0,43220.0,43519.0,43757.0,43948.0,43960.0,44015.0,44195.0,44924.0,45462.0,45466.0,45566.0,45664.0,45708.0,45731.0,45735.0,45833.0,45910.0,45930.0,46077.0,46206.0,46210.0,46442.0,46565.0,46812.0,46923.0,47029.0,47052.0,47272.0,47293.0,47306.0,47951.0,48116.0,48232.0,48451.0,48572.0,48636.0,48737.0,48833.0,49024.0,49169.0,49290.0,49329.0,49468.0,49516.0,49559.0,49847.0,49978.0,50768.0,51229.0,51511.0,51656.0,51848.0,51986.0,52075.0,52125.0,52288.0,52303.0,52540.0,52622.0,52631.0,52692.0,52774.0,53099.0,53139.0,53370.0,53598.0,53890.0,54250.0,54662.0,54817.0,54933.0,55278.0,55982.0,56161.0,57084.0,57154.0,57278.0,57378.0,57413.0,57542.0,57587.0,58191.0,58242.0,58336.0,58397.0,58434.0,58541.0,58568.0,58783.0,59309.0,59412.0,59430.0,59775.0,59804.0,60187.0,60398.0,60941.0,61157.0,61255.0,61463.0,61847.0,62197.0,62854.0,62941.0,63162.0,63371.0,63615.0,63864.0,63919.0,64463.0,64727.0,65499.0,66008.0,66163.0,66169.0,66291.0,66340.0,66718.0,66848.0,66864.0,66893.0,66913.0,67613.0,68383.0,68722.0,68786.0,69438.0,69521.0,69594.0,69710.0,69793.0,69850.0,69871.0,69886.0,69982.0,70126.0,70254.0,70374.0,70648.0,70785.0,71204.0,71323.0,71426.0,71718.0,71751.0,71909.0,72121.0,73180.0,73219.0,73231.0,73272.0,73574.0,73577.0,73601.0,74070.0,74109.0,75335.0,75407.0,75414.0,75702.0,75866.0,75921.0,75962.0,76271.0,76554.0,76901.0,77252.0,77524.0,77611.0,77861.0,77901.0,78087.0,78235.0,78684.0,78688.0,78862.0,78962.0,79462.0,79619.0,79712.0,79810.0,79817.0,79919.0,80391.0,80411.0,80680.0,80703.0,80762.0,80942.0,81135.0,81255.0,81335.0,81757.0,81920.0,81928.0,82010.0,82052.0,82072.0,82393.0,82423.0,82537.0,82698.0,82762.0,82807.0,83002.0,83511.0
,83561.0,83591.0,83740.0,84039.0,84562.0,84585.0,84765.0,84825.0,85020.0,85081.0,85126.0,85595.0,86386.0,86400.0,86920.0,86997.0,87070.0,87392.0,87449.0,87477.0,87488.0,87509.0,87577.0,88177.0,88443.0,88656.0,89085.0,89324.0,89466.0,89719.0,89877.0,89961.0,90130.0,90139.0,90145.0,91109.0,91526.0,92162.0,93246.0,93380.0,93807.0,93842.0,93932.0,93984.0,94123.0,94355.0,94371.0,94713.0,94958.0,95128.0,95379.0,95570.0,96464.0,96480.0,96907.0,97318.0,97368.0,97509.0,97728.0,97764.0,97786.0,98392.0,98432.0,98477.0,98596.0,98651.0,98709.0,98725.0,98794.0,98855.0,99860.0,100364.0,100411.0,100754.0,100897.0,100940.0,101177.0,101248.0,101447.0,101635.0,101890.0,101950.0,101953.0,102113.0,102167.0,102419.0,102437.0,102592.0,102693.0,103311.0,104164.0,104358.0,104424.0,104537.0,104736.0,105178.0,105414.0]},"kind":"numeric","n":105484,"n_null":0,"n_unique":105484,"null_rate":0.0,"stats":{"iqr":52741.5,"kurtosis":-1.200000000215694,"max":105484.0,"mean":52742.5,"median":52742.5,"min":1.0,"n_outliers":0,"outlier_rate":0.0,"q1":26371.75,"q3":79113.25,"skew":0.0,"std":30450.752234824457,"zero_rate":0.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.9% duplicate 
strings"}],"column":"glottocode","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105465],"edges":[2.0,2.15,2.3,2.45,2.6,2.75,2.9,3.05,3.2,3.3499999999999996,3.5,3.65,3.8,3.95,4.1,4.25,4.4,4.55,4.699999999999999,4.85,5.0,5.15,5.3,5.449999999999999,5.6,5.75,5.9,6.05,6.2,6.35,6.5,6.6499999999999995,6.8,6.95,7.1,7.25,7.3999999999999995,7.55,7.7,7.85,8.0]},"near_unique":false,"sample":["lakk1252","lazz1240","kham1282","cebu1242","west2456","gurd1238","copi1238","kham1282","gata1239","paez1247","jenn1240","izer1241","wann1242","telu1262","kara1476","aika1237","bero1242","urdu1245","west2443","torw1241","kwaz1243","wapi1253","naxi1245","nyah1250","urub1250","chec1245","kaum1238","tiwi1244","yaku1245","wikn1246","nort2972","leco1242","iton1250","bamu1256","juan1238","kofy1242","burd1238","sooo1256","tivv1240","begb1241","nene1249","dyug1238","timn1235","wamb1258","kara1476","atam1239","waya1269","bamu1256","dadi1250","buru1296"],"top_values":[["kham1282",622],["osse1243",483],["dutc1256",395],["stan1293",370],["hind1269",342],["gwan1268",323],["lith1251",315],["chec1245",309],["iris1253",288],["kaba1278",281],["beng1280",263],["buru1296",251],["sind1272",235],["tibe1272",223],["shix1238",221],["lazz1240",216],["basq1248",215],["east2328",213],["katc1249",213],["khar1287",208]],"top_words":[["kham1282",103],["osse1243",97],["iris1253",62],["gwan1268",60],["dutc1256",59],["lith1251",58],["lazz1240",57],["hind1269",57],["stan1293",56],["kaba1278",55],["buru1296",51],["tibe1272",50],["hung1274",50],["east2328",47],["basq1248",46],["shix1238",45],["chec1245",45],["lakk1252",43],["sind1272",43],["stan1290",41],["nucl1310",40],["khar1287",40],["tach1250",40],["khan1273",39],["giry1241",39]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.633
3333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_unique":2177,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.9793617989458117,"emoji_rate":0.0,"len_max":8,"len_mean":7.998919267377043,"len_median":8.0,"len_min":2,"len_p95":8.0,"n_duplicates":103307,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":94.14800000000002,"url_rate":0.0,"vocab_size":2168,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.0% duplicate 
strings"}],"column":"phoneme","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[67114,0,0,0,28726,0,0,0,6559,0,0,0,2225,0,0,0,401,0,0,0,267,0,0,0,104,0,0,0,5,0,0,0,70,0,0,0,1,0,0,12],"edges":[1.0,1.25,1.5,1.75,2.0,2.25,2.5,2.75,3.0,3.25,3.5,3.75,4.0,4.25,4.5,4.75,5.0,5.25,5.5,5.75,6.0,6.25,6.5,6.75,7.0,7.25,7.5,7.75,8.0,8.25,8.5,8.75,9.0,9.25,9.5,9.75,10.0,10.25,10.5,10.75,11.0]},"near_unique":false,"sample":["\u03c7","u","ndz","r","\u02e6","\u0268","d\u0324\u026e\u0324","u","i\u0303","a\u0303","t\u032a","ts","o","l","u","u\u0303","p","\u027d\u0324","u","t\u032a","\u0294","\u0290","\u0282","i","j","u\u032fo\u0303\u02d0","t","j","l","t\u032a\u0349","a\u0303","l","\u0294","b","\u0254","m","r","\u025b","t\u0320\u0283","f","w","\u0288\u0349","\u0254","n","\u0235","p","a","t\u032a","o\u031e\u0303","t\u0255\u02b0"],"top_values":[["m",2915],["i",2779],["k",2729],["j",2716],["u",2646],["a",2600],["p",2593],["w",2483],["n",2350],["t",2064],["l",2044],["s",2021],["b",1906],["\u014b",1898],["e",1842],["o",1826],["\u0261",1712],["h",1703],["d",1376],["r",1332]],"top_words":[["m",572],["i",537],["k",520],["u",516],["j",508],["a",506],["p",498],["w",459],["n",446],["l",395],["s",392],["b",363],["t",361],["o",358],["\u014b",349],["e",345],["\u0261",323],["h",307],["r",272],["f",256],["d",243],["\u0272",240],["t\u0320\u0283",237],["\u0294",215],["\u0254",212]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_uniq
ue":3142,"null_rate":0.0,"stats":{"allcaps_rate":0.0017538204846232605,"boilerplate_rate":0.0,"duplicate_rate":0.9702134920935876,"emoji_rate":0.0,"len_max":11,"len_mean":1.5006067270865724,"len_median":1.0,"len_min":1,"len_p95":3.0,"n_duplicates":102342,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":114.43930000000005,"url_rate":0.0,"vocab_size":1339,"word_mean":1.0,"word_median":1.0}},{"alerts":[],"column":"segment_class","extras":{"singletons":0,"top_values":[["consonant",72282],["vowel",31052],["tone",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":1.0075090073337545,"entropy_ratio":0.6356674097181094,"top_rate":0.685241363619127,"top_value":"consonant"}},{"alerts":[],"column":"source","extras":{"singletons":0,"top_values":[["ph",36274],["ea",16883],["upsid",13966],["er",9423],["saphon",9047],["aa",8064],["spa",7566],["ra",4261]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":2.6973229908177236,"entropy_ratio":0.8991076636059079,"top_rate":0.34388153653634673,"top_value":"ph"}},{"alerts":[],"column":"created_at","extras":{"singletons":0,"top_values":[["2026-01-06 05:13:20",75484],["2026-01-06 05:13:19",30000]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.8613814883254317,"entropy_ratio":0.8613814883254317,"top_rate":0.715596678169201,"top_value":"2026-01-06 05:13:20"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","segment_class.top_values","source.top_values","glottocode.n_unique","glottocode.top_values","phoneme.n_unique","phoneme.top_values","phoneme.stats.len_mean"],"featured_charts":[{"caption":"Shows that consonants make up the bulk of records, with tones a small minority.","column":"segment_class","kind":"donut"},{"caption":"Reveals heavy reliance on the 'ph' source (~34%) versus a 
long tail of smaller contributors.","column":"source","kind":"bar"},{"caption":"Highlights the most common phoneme symbols (m, i, k, j, u, a) across the dataset.","column":"phoneme","kind":"bar"},{"caption":"Top languages like kham1282 and osse1243 dominate, indicating uneven per-language coverage.","column":"glottocode","kind":"bar"},{"caption":"Most phoneme strings are 1 character; check the tail up to 11 characters for compound symbols.","column":"phoneme","kind":"length"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset contains 105,484 rows of phoneme records linked to languages by glottocode, drawn from 8 different sources. Each row pairs a language identifier (2,177 unique glottocodes) with a phoneme (3,142 unique values, mostly 1-character IPA-like symbols) and a segment class. The segment_class breakdown is the most informative summary: consonants dominate at 72,282 rows, vowels account for 31,052, and tones only 2,150. Source coverage is uneven \u2014 'ph' alone supplies about 34% of records, while the long tail (ra, spa, aa) is much smaller, which matters if you compare across sources. Glottocode frequency is also skewed: kham1282 and osse1243 each appear hundreds of times, suggesting some languages have far richer phoneme inventories recorded than others.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median","stats.skew","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"This is a sequential row identifier: every one of the 105484 rows has a unique value, running from min 1 to max 105484 with mean and median both at 52742.5 and skew 0.0. 
The perfectly uniform distribution and zero null/outlier rate confirm it carries no analytic signal beyond row ordering.","role":"identifier","scope":"column","target":"phoneme_id","treatment":"drop from modelling; retain only as a join key."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","len_min","len_max","len_mean","duplicate_rate","one_word_rate","vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds Glottolog language codes \u2014 fixed 8-character identifiers (len_mean 7.999, len_max 8) drawn from 2,177 distinct codes across 105,484 rows. With a 97.94% duplicate rate and top codes like kham1282 (622) and osse1243 (483) repeating heavily, each code labels many rows rather than identifying them. The 2-character minimum length suggests a small number of malformed or truncated entries worth inspecting.","role":"foreign_key","scope":"column","target":"glottocode","treatment":"Treat as a categorical join key to Glottolog metadata; verify the short (len=2) entries."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","stats.duplicate_rate","stats.one_word_rate","stats.len_mean","stats.len_max","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds phoneme tokens \u2014 single-word strings averaging 1.5 characters with a max length of 11 and every row being one word. Despite 105,484 rows, only 3,142 unique values exist and 97.0% are duplicates, with single letters like 'm', 'i', 'k', 'j' dominating the top values. 
The small vocabulary (1,339 words) and tiny token sizes suggest these are IPA-like phonetic units rather than full words.","role":"feature","scope":"column","target":"phoneme","treatment":"Treat as a categorical token and label-encode or embed before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"A 3-level categorical tag classifying segments as consonant, vowel, or tone, with no nulls across 105,484 rows. The distribution is heavily skewed: consonant dominates at 68.5%, vowel takes most of the rest, and tone is rare at only 2,150 occurrences. Entropy ratio of 0.64 confirms the imbalance.","role":"feature","scope":"column","target":"segment_class","treatment":"One-hot encode; consider class-imbalance handling if predicting the rare 'tone' class."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical provenance tag with 8 distinct codes (ph, ea, upsid, er, saphon, aa, spa, ra) marking which source each of the 105,484 rows came from. Distribution is fairly balanced for a source field \u2014 entropy ratio 0.899 and the top code 'ph' covers only 34.4% \u2014 suggesting the dataset is a merge of several sizeable corpora (from 4,261 to 36,274 rows each) rather than one dominant source with minor supplements. 
No nulls, so every row is attributable.","role":"metadata","scope":"column","target":"source","treatment":"Keep as a categorical grouping/stratification key; one-hot encode if used as a feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","stats.cardinality","stats.top_rate","stats.top_value","top_values","null_rate"],"model":"anthropic:claude-opus-4-7","narrative":"Despite its name, created_at holds only 2 distinct timestamp values across 105,484 rows, both within one second of each other on 2026-01-06. This looks like a batch ingestion or load timestamp rather than per-record creation time, with 71.6% of rows sharing the dominant value. There is no temporal variation to exploit as a feature.","role":"metadata","scope":"column","target":"created_at","treatment":"Drop; no temporal signal beyond a batch-load marker."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":2181,"prompt_tokens":7987,"total_tokens":10168}},"language_counts":{},"meta":{"generated_at":"2026-05-01T23:25:14+00:00","mode":"full","row_count":105484,"sampled_rows":105484,"seed":42,"source":"/home/coolhand/data/linguistic.db"},"notes":[],"saturn_version":"0.2.0","schema":{"created_at":"categorical","glottocode":"text","phoneme":"text","phoneme_id":"numeric","segment_class":"categorical","source":"categorical"}}
