{"columns":[{"alerts":[],"column":"InventoryID","extras":{"histogram":{"counts":[2729,2947,2742,2385,2399,2530,2270,2323,2533,3039,2896,2609,2693,2513,2637,2514,2898,3314,3634,2993,3172,3290,2686,3001,1779,2046,1878,1864,2685,3387,3096,3142,3264,3557,2966,1975,1761,1659,1855,1823],"edges":[1.0,76.475,151.95,227.42499999999998,302.9,378.375,453.84999999999997,529.3249999999999,604.8,680.275,755.75,831.2249999999999,906.6999999999999,982.175,1057.6499999999999,1133.125,1208.6,1284.0749999999998,1359.55,1435.0249999999999,1510.5,1585.975,1661.4499999999998,1736.925,1812.3999999999999,1887.8749999999998,1963.35,2038.8249999999998,2114.2999999999997,2189.7749999999996,2265.25,2340.725,2416.2,2491.6749999999997,2567.1499999999996,2642.625,2718.1,2793.575,2869.0499999999997,2944.5249999999996,3020.0]},"sample":[12.0,15.0,17.0,25.0,37.0,52.0,53.0,60.0,65.0,66.0,67.0,68.0,73.0,74.0,91.0,94.0,101.0,109.0,109.0,123.0,128.0,131.0,131.0,162.0,176.0,178.0,185.0,185.0,214.0,224.0,224.0,229.0,235.0,241.0,242.0,245.0,252.0,257.0,260.0,262.0,264.0,266.0,274.0,277.0,277.0,282.0,285.0,305.0,315.0,315.0,322.0,325.0,327.0,344.0,351.0,356.0,358.0,359.0,367.0,370.0,373.0,381.0,387.0,388.0,393.0,399.0,402.0,404.0,426.0,426.0,429.0,429.0,430.0,432.0,439.0,440.0,451.0,460.0,461.0,463.0,469.0,471.0,493.0,495.0,496.0,508.0,509.0,513.0,514.0,516.0,527.0,538.0,541.0,546.0,556.0,556.0,560.0,564.0,565.0,588.0,592.0,608.0,618.0,630.0,632.0,636.0,641.0,643.0,673.0,698.0,705.0,705.0,715.0,716.0,723.0,724.0,733.0,738.0,739.0,744.0,750.0,753.0,770.0,784.0,797.0,802.0,804.0,811.0,814.0,827.0,837.0,838.0,838.0,839.0,847.0,850.0,851.0,852.0,861.0,868.0,871.0,874.0,880.0,882.0,891.0,896.0,902.0,908.0,911.0,915.0,915.0,917.0,921.0,927.0,934.0,940.0,942.0,945.0,951.0,956.0,958.0,961.0,970.0,970.0,970.0,987.0,1002.0,1003.0,1008.0,1014.0,1017.0,1019.0,1028.0,1041.0,1055.0,1055.0,1075.0,1099.0,1105.0,1107.0,1112.0,1113.0,1118.0,1125.0,1126.0,1129.0,1137.0,1140.0,1157.0,1158.0,1178.0,1214.0,1224.0,1236.0,1242.0,1
244.0,1247.0,1248.0,1256.0,1261.0,1267.0,1267.0,1269.0,1275.0,1292.0,1305.0,1305.0,1307.0,1309.0,1310.0,1311.0,1311.0,1313.0,1315.0,1316.0,1318.0,1319.0,1319.0,1326.0,1329.0,1335.0,1339.0,1342.0,1343.0,1346.0,1347.0,1347.0,1359.0,1363.0,1366.0,1370.0,1373.0,1374.0,1376.0,1378.0,1380.0,1383.0,1384.0,1385.0,1389.0,1390.0,1391.0,1398.0,1401.0,1418.0,1425.0,1433.0,1437.0,1441.0,1445.0,1447.0,1449.0,1452.0,1452.0,1459.0,1461.0,1461.0,1462.0,1464.0,1471.0,1472.0,1478.0,1484.0,1491.0,1501.0,1512.0,1517.0,1520.0,1528.0,1542.0,1546.0,1568.0,1571.0,1574.0,1576.0,1577.0,1580.0,1581.0,1594.0,1596.0,1598.0,1599.0,1600.0,1601.0,1602.0,1608.0,1621.0,1623.0,1623.0,1630.0,1631.0,1639.0,1646.0,1659.0,1664.0,1666.0,1673.0,1689.0,1701.0,1716.0,1718.0,1724.0,1728.0,1733.0,1739.0,1741.0,1754.0,1760.0,1779.0,1791.0,1794.0,1794.0,1797.0,1798.0,1811.0,1817.0,1817.0,1818.0,1819.0,1849.0,1881.0,1896.0,1898.0,1921.0,1924.0,1926.0,1930.0,1933.0,1934.0,1936.0,1936.0,1939.0,1945.0,1951.0,1955.0,1965.0,1971.0,1986.0,1990.0,1992.0,2004.0,2006.0,2014.0,2024.0,2070.0,2071.0,2072.0,2073.0,2085.0,2086.0,2087.0,2105.0,2106.0,2150.0,2153.0,2153.0,2162.0,2165.0,2166.0,2167.0,2172.0,2178.0,2186.0,2194.0,2200.0,2202.0,2207.0,2208.0,2213.0,2216.0,2229.0,2229.0,2232.0,2234.0,2245.0,2249.0,2251.0,2254.0,2254.0,2256.0,2266.0,2266.0,2273.0,2274.0,2276.0,2279.0,2284.0,2287.0,2289.0,2301.0,2305.0,2305.0,2307.0,2308.0,2308.0,2315.0,2316.0,2318.0,2323.0,2324.0,2325.0,2329.0,2341.0,2342.0,2343.0,2346.0,2352.0,2366.0,2367.0,2372.0,2374.0,2379.0,2380.0,2381.0,2392.0,2410.0,2411.0,2424.0,2426.0,2428.0,2437.0,2438.0,2438.0,2439.0,2439.0,2440.0,2456.0,2463.0,2467.0,2475.0,2481.0,2484.0,2489.0,2491.0,2493.0,2497.0,2497.0,2497.0,2519.0,2527.0,2541.0,2564.0,2566.0,2577.0,2577.0,2579.0,2580.0,2584.0,2589.0,2589.0,2596.0,2601.0,2605.0,2611.0,2616.0,2645.0,2646.0,2662.0,2679.0,2682.0,2688.0,2698.0,2700.0,2701.0,2719.0,2720.0,2723.0,2728.0,2731.0,2734.0,2735.0,2738.0,2741.0,2780.0,2804.0,2807.0,2824.0,2831.0,2833.0,2842.0,2845.0
,2854.0,2861.0,2873.0,2876.0,2876.0,2882.0,2884.0,2896.0,2896.0,2902.0,2906.0,2930.0,2965.0,2972.0,2975.0,2979.0,2989.0,3008.0,3018.0]},"kind":"numeric","n":105484,"n_null":0,"n_unique":3020,"null_rate":0.0,"stats":{"iqr":1468.0,"kurtosis":-1.1461956846362658,"max":3020.0,"mean":1479.331083387054,"median":1464.0,"min":1.0,"n_outliers":0,"outlier_rate":0.0,"q1":769.0,"q3":2237.0,"skew":-0.0023972460391718485,"std":843.1107593327903,"zero_rate":0.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.9% duplicate strings"}],"column":"Glottocode","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105465],"edges":[2.0,2.15,2.3,2.45,2.6,2.75,2.9,3.05,3.2,3.3499999999999996,3.5,3.65,3.8,3.95,4.1,4.25,4.4,4.55,4.699999999999999,4.85,5.0,5.15,5.3,5.449999999999999,5.6,5.75,5.9,6.05,6.2,6.35,6.5,6.6499999999999995,6.8,6.95,7.1,7.25,7.3999999999999995,7.55,7.7,7.85,8.0]},"near_unique":false,"sample":["lakk1252","lazz1240","kham1282","cebu1242","west2456","gurd1238","copi1238","kham1282","gata1239","paez1247","jenn1240","izer1241","wann1242","telu1262","kara1476","aika1237","bero1242","urdu1245","west2443","torw1241","kwaz1243","wapi1253","naxi1245","nyah1250","urub1250","chec1245","kaum1238","tiwi1244","yaku1245","wikn1246","nort2972","leco1242","iton1250","bamu1256","juan1238","kofy1242","burd1238","sooo1256","tivv1240","begb1241","nene1249","dyug1238","timn1235","wamb1258","kara1476","atam1239","waya1269","bamu1256","dadi1250","buru1296"],"top_values":[["kham1282",622],["osse1243",483],["dutc1256",395],["stan1293",370],["hind1269",342],["gwan1268",323],["lith1251",315],["chec1245",309],["iris1253",288],["kaba1278",281],["beng1280",263],["buru1296",251],["sind1272",235],["tibe1272",223],["
shix1238",221],["lazz1240",216],["basq1248",215],["east2328",213],["katc1249",213],["khar1287",208]],"top_words":[["kham1282",103],["osse1243",97],["iris1253",62],["gwan1268",60],["dutc1256",59],["lith1251",58],["lazz1240",57],["hind1269",57],["stan1293",56],["kaba1278",55],["buru1296",51],["tibe1272",50],["hung1274",50],["east2328",47],["basq1248",46],["shix1238",45],["chec1245",45],["lakk1252",43],["sind1272",43],["stan1290",41],["nucl1310",40],["khar1287",40],["tach1250",40],["khan1273",39],["giry1241",39]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_unique":2177,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.9793617989458117,"emoji_rate":0.0,"len_max":8,"len_mean":7.998919267377043,"len_median":8.0,"len_min":2,"len_p95":8.0,"n_duplicates":103307,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":94.14800000000002,"url_rate":0.0,"vocab_size":2168,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"98.0% duplicate 
strings"}],"column":"ISO6393","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0,3.025,3.05,3.075,3.1,3.125,3.15,3.175,3.2,3.225,3.25,3.275,3.3,3.325,3.35,3.375,3.4,3.425,3.45,3.475,3.5]},"near_unique":false,"sample":["lbe","lzz","khg","ceb","xwl","gdj","cce","khg","gaq","pbb","xuj","izr","wan","tel","gbd","tba","bom","urd","mis","trw","xwa","wap","nxq","cbn","urb","che","nyf","tiw","sah","wua","emp","lec","ito","bvm","jun","kwl","bxn","teu","tiv","bqv","yrk","dyd","tem","wmb","gbd","amz","way","bvm","mps","bsk"],"top_values":[["mis",828],["khg",622],["oss",525],["nld",395],["eng",370],["hin",342],["gwn",323],["lit",315],["che",309],["gle",288],["nyf",282],["kbd",281],["ben",263],["eus",258],["sgw",254],["bsk",251],["xtc",245],["snd",235],["bod",223],["sxg",221]],"top_words":[["mis",157],["oss",108],["khg",103],["gle",62],["gwn",60],["sgw",59],["nld",59],["lit",58],["lzz",57],["hin",57],["eng",56],["kbd",55],["eus",54],["bsk",51],["bod",50],["hun",50],["nyf",49],["mhr",47],["sxg",45],["che",45],["xtc",45],["lbe",43],["snd",43],["fra",41],["mya",40]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_unique":2095,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.980
1391680254825,"emoji_rate":0.0,"len_max":3,"len_mean":3.0,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":103389,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":119.52800000000003,"url_rate":0.0,"vocab_size":2086,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"84.3% rows are a single word"},{"code":"allcaps","level":"info","message":"13.1% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.4% duplicate strings"}],"column":"LanguageName","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[3915,29541,34409,14985,7358,4849,3897,2893,1292,993,198,224,218,39,93,130,64,23,37,57,0,52,0,23,0,0,20,0,40,0,87,0,0,0,0,0,0,0,0,47],"edges":[2.0,3.925,5.85,7.775,9.7,11.625,13.55,15.475,17.4,19.325,21.25,23.175,25.1,27.025000000000002,28.95,30.875,32.8,34.725,36.65,38.575,40.5,42.425000000000004,44.35,46.275,48.2,50.125,52.050000000000004,53.975,55.9,57.825,59.75,61.675000000000004,63.6,65.525,67.45,69.375,71.3,73.22500000000001,75.15,77.075,79.0]},"near_unique":false,"sample":["Lak","Laz","Kami Tibetan","Cebuano","western xwla","Kurtjar","Copi","Kham Tibetan","Gta\u0294","Paez","Kattunaika","Izere","Wan","Telugu","Garadjari","HUARI","BIROM","Urdu","West Djadjawurung","Torwali","Kwaza","Wapishana","NAXI","NYAH KUR","Kaapor","Chechen","Kauma","TIWI","Yakut","Kugu Nganhcara","Northern Ember\u00e1","Leco","Itonama","Bamunka","Juang","Kofyar","Purduna","Soo","Tiv","Tin\u0254r","Tundra Nenets","Jukun","Themne","Wambaya","Karajarri","Atampaya","Wayana","Bamunka","DADIBI","Burushaski"],"top_values":[["Iron 
Ossetic",444],["Dutch",395],["Chechen",309],["Lithuanian",263],["Sindhi",235],["Kabardian",225],["Bengali",220],["Laz",216],["Burushaski",208],["Basque",187],["Hindi",187],["Kharia",172],["Hungarian",168],["Hausa",165],["Nepali",165],["Telugu",163],["!Xo\u0301o\u0303",161],["Northeastern Thai",161],["Burmese",160],["Mbembe",153]],"top_words":[["tibetan",240],["northern",131],["ossetic",102],["southern",96],["quechua",93],["eastern",92],["iron",91],["saami",90],["western",86],["thai",63],["english",63],["chinese",63],["dutch",63],["arabic",62],["irish",62],["gwandara",60],["fulfulde",60],["lithuanian",58],["laz",57],["german",56],["kabardian",55],["central",55],["balochi",54],["burmese",54],["basque",54]],"vocab_skipped":null,"word_histogram":{"counts":[88959,0,0,14036,0,0,1563,0,0,0,393,0,0,258,0,0,109,0,0,0,20,0,0,99,0,0,0,0,0,47],"edges":[1.0,1.3,1.6,1.9,2.2,2.5,2.8,3.1,3.4,3.6999999999999997,4.0,4.3,4.6,4.9,5.2,5.5,5.8,6.1,6.3999999999999995,6.7,7.0,7.3,7.6,7.8999999999999995,8.2,8.5,8.8,9.1,9.4,9.7,10.0]}},"kind":"text","n":105484,"n_null":0,"n_unique":2716,"null_rate":0.0,"stats":{"allcaps_rate":0.13140381479655683,"boilerplate_rate":0.0,"duplicate_rate":0.974252019263585,"emoji_rate":0.0,"len_max":79,"len_mean":7.822219483523567,"len_median":7.0,"len_min":2,"len_p95":16.0,"n_duplicates":102768,"n_empty":0,"one_word_rate":0.8433411702248682,"readability_flesch_mean":53.18077500000003,"url_rate":0.0,"vocab_size":2670,"word_mean":1.2005422623336242,"word_median":1.0}},{"alerts":[],"column":"SpecificDialect","extras":{"singletons":0,"top_values":[["NA",75807],["",7692],["W2",120],["Lezgian (G\u00fcne)",96],["Santa",92],["Central Pakistan",83],["Babungo (Grassfields Bantu, Ring)",82],["Scottish Gaelic (Lewis)",82],["Tangari",81],["Kanga",76],["Kufa",75],["Skolt Saami (Su\u00f5\u02b9nn\u02bcjel)",75],["Standard Hindi (as spoken in Varanasi, Lucknow, Delhi etc.)",74],["Standard (eastern)",74],["Guovdageaidnu",74],["Nuosu (Black Yi)",74],["Northern Qiang 
(Yadu)",73],["Bangladeshi Standard (spoken in Dhaka and other urban aread of Bangladesh)",72],["Standard Italian",70],["Chechen (Ploskost)",70]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":546,"null_rate":0.0,"stats":{"cardinality":546,"entropy":2.9687253765324386,"entropy_ratio":0.3264934200400422,"top_rate":0.7186587539342459,"top_value":"NA"}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"allcaps","level":"info","message":"100.0% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.0% duplicate strings"}],"column":"GlyphID","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[67114,0,0,0,28726,0,0,0,6559,0,0,0,2225,0,0,0,401,0,0,0,267,0,0,0,104,0,0,0,5,0,0,0,70,0,0,0,1,0,0,12],"edges":[4.0,5.25,6.5,7.75,9.0,10.25,11.5,12.75,14.0,15.25,16.5,17.75,19.0,20.25,21.5,22.75,24.0,25.25,26.5,27.75,29.0,30.25,31.5,32.75,34.0,35.25,36.5,37.75,39.0,40.25,41.5,42.75,44.0,45.25,46.5,47.75,49.0,50.25,51.5,52.75,54.0]},"near_unique":false,"sample":["03C7","0075","006E+0064+007A","0072","02E6","0268","0064+0324+026E+0324","0075","0069+0303","0061+0303","0074+032A","0074+0073","006F","006C","0075","0075+0303","0070","027D+0324","0075","0074+032A","0294","0290","0282","0069","006A","0075+032F+006F+0303+02D0","0074","006A","006C","0074+032A+0349","0061+0303","006C","0294","0062","0254","006D","0072","025B","0074+0320+0283","0066","0077","0288+0349","0254","006E","0235","0070","0061","0074+032A","006F+031E+0303","0074+0255+02B0"],"top_values":[["006D",2915],["0069",2779],["006B",2729],["006A",2716],["0075",2646],["0061",2600],["0070",2593],["0077",2483],["006E",2350],["0074",2064],["006C",2044],["0073",2021],["0062",1906],["014B",1898],["0065",1842],["006F",1826],["0261",1712],["0068",1703],["0064",1376],["0072",1332]],"top_words":[["006d",572],["0069",537],["006b",520],["0075",516],["
006a",508],["0061",506],["0070",498],["0077",459],["006e",443],["006c",395],["0073",392],["0062",363],["0074",361],["006f",358],["014b",349],["0065",345],["0261",323],["0068",307],["0072",268],["0066",256],["0064",243],["0272",240],["0074+0320+0283",237],["0294",215],["0254",212]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_unique":3142,"null_rate":0.0,"stats":{"allcaps_rate":1.0,"boilerplate_rate":0.0,"duplicate_rate":0.9702134920935876,"emoji_rate":0.0,"len_max":54,"len_mean":6.503033635432862,"len_median":4.0,"len_min":4,"len_p95":14.0,"n_duplicates":102342,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":121.22000000000004,"url_rate":0.0,"vocab_size":1343,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"97.0% duplicate 
strings"}],"column":"Phoneme","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[67114,0,0,0,28726,0,0,0,6559,0,0,0,2225,0,0,0,401,0,0,0,267,0,0,0,104,0,0,0,5,0,0,0,70,0,0,0,1,0,0,12],"edges":[1.0,1.25,1.5,1.75,2.0,2.25,2.5,2.75,3.0,3.25,3.5,3.75,4.0,4.25,4.5,4.75,5.0,5.25,5.5,5.75,6.0,6.25,6.5,6.75,7.0,7.25,7.5,7.75,8.0,8.25,8.5,8.75,9.0,9.25,9.5,9.75,10.0,10.25,10.5,10.75,11.0]},"near_unique":false,"sample":["\u03c7","u","ndz","r","\u02e6","\u0268","d\u0324\u026e\u0324","u","i\u0303","a\u0303","t\u032a","ts","o","l","u","u\u0303","p","\u027d\u0324","u","t\u032a","\u0294","\u0290","\u0282","i","j","u\u032fo\u0303\u02d0","t","j","l","t\u032a\u0349","a\u0303","l","\u0294","b","\u0254","m","r","\u025b","t\u0320\u0283","f","w","\u0288\u0349","\u0254","n","\u0235","p","a","t\u032a","o\u031e\u0303","t\u0255\u02b0"],"top_values":[["m",2915],["i",2779],["k",2729],["j",2716],["u",2646],["a",2600],["p",2593],["w",2483],["n",2350],["t",2064],["l",2044],["s",2021],["b",1906],["\u014b",1898],["e",1842],["o",1826],["\u0261",1712],["h",1703],["d",1376],["r",1332]],"top_words":[["m",572],["i",537],["k",520],["u",516],["j",508],["a",506],["p",498],["w",459],["n",446],["l",395],["s",392],["b",363],["t",361],["o",358],["\u014b",349],["e",345],["\u0261",323],["h",307],["r",272],["f",256],["d",243],["\u0272",240],["t\u0320\u0283",237],["\u0294",215],["\u0254",212]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,105484,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":105484,"n_null":0,"n_uniq
ue":3142,"null_rate":0.0,"stats":{"allcaps_rate":0.0017538204846232605,"boilerplate_rate":0.0,"duplicate_rate":0.9702134920935876,"emoji_rate":0.0,"len_max":11,"len_mean":1.5006067270865724,"len_median":1.0,"len_min":1,"len_p95":3.0,"n_duplicates":102342,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":114.43930000000005,"url_rate":0.0,"vocab_size":1339,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"91.3% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"93.5% duplicate strings"}],"column":"Allophones","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[26762,66019,4934,3125,1246,1181,789,409,334,0,226,160,69,65,41,36,12,16,12,0,14,9,8,0,3,3,3,3,2,0,1,0,0,0,0,0,1,0,0,1],"edges":[1.0,1.9,2.8,3.7,4.6,5.5,6.4,7.3,8.2,9.1,10.0,10.9,11.8,12.700000000000001,13.6,14.5,15.4,16.3,17.2,18.1,19.0,19.900000000000002,20.8,21.7,22.6,23.5,24.400000000000002,25.3,26.2,27.1,28.0,28.900000000000002,29.8,30.7,31.6,32.5,33.4,34.300000000000004,35.2,36.1,37.0]},"near_unique":false,"sample":["\u03c7","NA","NA","NA","\u02e6","NA","d\u0324\u026e\u0324","NA","NA","a\u0303 \u0259\u0303","NA","ts","o","l\u032a l","u","NA","NA","NA","NA","t\u032a","\u0294","\u0290\u02c0 \u0290","NA","NA","NA","NA","t t\u02b0","NA","l\u02b2 l","NA","NA","NA","NA","b b\u02b7 b\u02b2","\u0254 \u0254\u0303","m","NA","\u025b","t\u0320\u0283","f","NA","NA","\u0254","NA","NA","NA","a a\u02d0 \u0251 a\u0303","t\u032a t\u02b7 
t\u02b2","NA","NA"],"top_values":[["NA",53580],["m",1091],["j",1048],["w",965],["s",938],["n",912],["a",900],["u",883],["i",878],["l",865],["k",821],["p",815],["b",800],["f",784],["t",782],["o",768],["h",767],["e",757],["\u014b",750],["\u0261",744]],"top_words":[["na",10183],["m",280],["a",270],["k",254],["i",252],["n",251],["j",248],["w",242],["b",241],["u",241],["s",240],["p",239],["l",227],["e",205],["o",201],["t",200],["\u0261",199],["f",189],["\u014b",187],["d",182],["h",175],["r",161],["t\u0320\u0283",158],["\u0272",158],["\u025b",154]],"vocab_skipped":null,"word_histogram":{"counts":[96315,0,6317,0,1847,0,698,0,0,191,0,62,0,25,0,0,13,0,7,0,5,0,0,2,0,1,0,0,0,1],"edges":[1.0,1.4333333333333333,1.8666666666666667,2.3,2.7333333333333334,3.166666666666667,3.6,4.033333333333333,4.466666666666667,4.9,5.333333333333334,5.766666666666667,6.2,6.633333333333334,7.066666666666666,7.5,7.933333333333334,8.366666666666667,8.8,9.233333333333334,9.666666666666668,10.1,10.533333333333333,10.966666666666667,11.4,11.833333333333334,12.266666666666667,12.700000000000001,13.133333333333333,13.566666666666666,14.0]}},"kind":"text","n":105484,"n_null":0,"n_unique":6892,"null_rate":0.0,"stats":{"allcaps_rate":0.002910393993401843,"boilerplate_rate":0.0,"duplicate_rate":0.9346630768647378,"emoji_rate":0.0,"len_max":37,"len_mean":2.0834154942929732,"len_median":2.0,"len_min":1,"len_p95":4.0,"n_duplicates":98592,"n_empty":0,"one_word_rate":0.9130768647377802,"readability_flesch_mean":116.18602500000004,"url_rate":0.0,"vocab_size":1263,"word_mean":1.1285977020211595,"word_median":1.0}},{"alerts":[],"column":"Marginal","extras":{"singletons":0,"top_values":[["FALSE",83263],["NA",20874],["TRUE",1347]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.8122325109446773,"entropy_ratio":0.5124616579730514,"top_rate":0.7893424595199272,"top_value":"FALSE"}},{"alerts":[],"column":"SegmentClass","extras":{"singletons":0,"top_values":[["c
onsonant",72282],["vowel",31052],["tone",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":1.0075090073337545,"entropy_ratio":0.6356674097181094,"top_rate":0.685241363619127,"top_value":"consonant"}},{"alerts":[],"column":"Source","extras":{"singletons":0,"top_values":[["ph",36274],["ea",16883],["upsid",13966],["er",9423],["saphon",9047],["aa",8064],["spa",7566],["ra",4261]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":2.6973229908177236,"entropy_ratio":0.8991076636059079,"top_rate":0.34388153653634673,"top_value":"ph"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 98.0% of rows"}],"column":"tone","extras":{"singletons":0,"top_values":[["0",103334],["+",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.14358135440365866,"entropy_ratio":0.14358135440365866,"top_rate":0.9796177619354594,"top_value":"0"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 98.0% of rows"}],"column":"stress","extras":{"singletons":0,"top_values":[["-",103334],["0",2150]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.14358135440365866,"entropy_ratio":0.14358135440365866,"top_rate":0.9796177619354594,"top_value":"-"}},{"alerts":[],"column":"syllabic","extras":{"singletons":0,"top_values":[["-",72248],["+",30692],["0",2150],["+,-",244],["-,+",124],["-,+,-",12],["-,+,+",12],["+,+,-",2]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.0416449706070494,"entropy_ratio":0.3472149902023498,"top_rate":0.6849190398543855,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 97.8% of 
rows"}],"column":"short","extras":{"singletons":0,"top_values":[["-",103125],["0",2150],["+",204],["-,+",5]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":4,"null_rate":0.0,"stats":{"cardinality":4,"entropy":0.1644919893889015,"entropy_ratio":0.08224599469445075,"top_rate":0.9776364187933715,"top_value":"-"}},{"alerts":[],"column":"long","extras":{"singletons":1,"top_values":[["-",94844],["+",8386],["0",2150],["-,+",63],["+,-",40],["-,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":0.5536707334539547,"entropy_ratio":0.21418907752027,"top_rate":0.8991316218573433,"top_value":"-"}},{"alerts":[],"column":"consonantal","extras":{"singletons":1,"top_values":[["+",64257],["-",39041],["0",2151],["+,-",34],["-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":5,"null_rate":0.0,"stats":{"cardinality":5,"entropy":1.0847542499500977,"entropy_ratio":0.46717822672399323,"top_rate":0.6091634750293884,"top_value":"+"}},{"alerts":[],"column":"sonorant","extras":{"singletons":1,"top_values":[["+",55920],["-",45322],["0",2150],["+,-",1948],["-,+",89],["+,-,-",29],["+,-,+",25],["+,-,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.2447279843992523,"entropy_ratio":0.41490932813308407,"top_rate":0.5301277918926093,"top_value":"+"}},{"alerts":[],"column":"continuant","extras":{"singletons":1,"top_values":[["+",57952],["-",44585],["0",2151],["-,+",728],["-,-,+",50],["+,-",9],["0,-,+",4],["-,+,+",4],["0,0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":9,"null_rate":0.0,"stats":{"cardinality":9,"entropy":1.1715712985088285,"entropy_ratio":0.3695895953297838,"top_rate":0.549391376891282,"top_value":"+"}},{"alerts":[],"column":"delayedRelease","extras":{"singletons":1,"top_values":[["0",58035],["-",27384],["+",19533],["-,+",492],["0,-,+",33],["+,-",6],["0,0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique
":7,"null_rate":0.0,"stats":{"cardinality":7,"entropy":1.4706196977124786,"entropy_ratio":0.5238453058278119,"top_rate":0.5501782260816807,"top_value":"0"}},{"alerts":[],"column":"approximant","extras":{"singletons":0,"top_values":[["-",58966],["+",44266],["0",2150],["-,+",71],["-,-,+",25],["+,-",6]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":1.1199829953143658,"entropy_ratio":0.4332685657923129,"top_rate":0.559004209169163,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 96.7% of rows"}],"column":"tap","extras":{"singletons":0,"top_values":[["-",102023],["0",2203],["+",1218],["-,+",25],["-,-,+",15]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":5,"null_rate":0.0,"stats":{"cardinality":5,"entropy":0.24210734477763832,"entropy_ratio":0.10426995793312155,"top_rate":0.9671893367714535,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 96.2% of 
rows"}],"column":"trill","extras":{"singletons":0,"top_values":[["-",101427],["0",2202],["+",1819],["-,+",26],["-,-,+",8],["+,-",2]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":0.27623548437485484,"entropy_ratio":0.10686247258820596,"top_rate":0.9615391907777483,"top_value":"-"}},{"alerts":[],"column":"nasal","extras":{"singletons":2,"top_values":[["-",85269],["+",15941],["0",2150],["+,-",1973],["-,+",95],["+,-,-",54],["+,-,+,-",1],["-,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":0.8969753823462909,"entropy_ratio":0.2989917941154303,"top_rate":0.8083595616396799,"top_value":"-"}},{"alerts":[],"column":"lateral","extras":{"singletons":1,"top_values":[["-",98968],["+",4211],["0",2150],["-,+",135],["+,-",12],["-,-,+",4],["-,+,-",3],["0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":0.4012197735040855,"entropy_ratio":0.13373992450136182,"top_rate":0.9382275984983505,"top_value":"-"}},{"alerts":[],"column":"labial","extras":{"singletons":2,"top_values":[["-",71961],["+",28241],["-,+",2414],["0",2160],["+,-",531],["-,-,+",121],["+,-,-",21],["0,+,-",8],["-,+,-",6],["0,-,+",5],["-,+,+",5],["+,+,-",5],["+,-,+",4],["-,-,+,+",1],["0,+,-,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":15,"null_rate":0.0,"stats":{"cardinality":15,"entropy":1.1818094024210288,"entropy_ratio":0.3024936003453549,"top_rate":0.6821982480755375,"top_value":"-"}},{"alerts":[],"column":"round","extras":{"singletons":2,"top_values":[["0",74155],["+",16956],["-",14082],["-,+",269],["-,-,+",17],["-,0,+",3],["0,-,+",1],["+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.1938930558003973,"entropy_ratio":0.39796435193346574,"top_rate":0.7029976110120966,"top_value":"0"}},{"alerts":[],"column":"labiodental","extras":{"singleto
ns":1,"top_values":[["0",74124],["-",28726],["+",2574],["+,-",56],["-,+",3],["+,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":1.0058099473034294,"entropy_ratio":0.38910040165875803,"top_rate":0.7027037275795381,"top_value":"0"}},{"alerts":[],"column":"coronal","extras":{"singletons":1,"top_values":[["-",66234],["+",36955],["0",2160],["+,-",87],["-,+",41],["-,-,+",6],["+,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":7,"null_rate":0.0,"stats":{"cardinality":7,"entropy":1.0803736810067606,"entropy_ratio":0.3848368699369578,"top_rate":0.627905653938038,"top_value":"-"}},{"alerts":[],"column":"anterior","extras":{"singletons":0,"top_values":[["0",68372],["+",25704],["-",11391],["-,+",9],["+,-",5],["-,-,+",3]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":1.2508420205386073,"entropy_ratio":0.4838917470522864,"top_rate":0.6481741306738462,"top_value":"0"}},{"alerts":[],"column":"distributed","extras":{"singletons":3,"top_values":[["0",69639],["-",22283],["+",13228],["-,+",296],["-,-,+",25],["+,-",5],["0,-,+",3],["+,-,+",2],["0,+,-",1],["+,+,-",1],["0,0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":11,"null_rate":0.0,"stats":{"cardinality":11,"entropy":1.273464741434796,"entropy_ratio":0.36811386430480325,"top_rate":0.6601854309658337,"top_value":"0"}},{"alerts":[],"column":"strident","extras":{"singletons":1,"top_values":[["0",68410],["-",25410],["+",11039],["-,+",585],["-,-,+",26],["+,-",7],["-,+,-",3],["0,-,+",3],["0,0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":9,"null_rate":0.0,"stats":{"cardinality":9,"entropy":1.2870836705484179,"entropy_ratio":0.40602969154248014,"top_rate":0.6485343748814986,"top_value":"0"}},{"alerts":[],"column":"dorsal","extras":{"singletons":4,"top_values":[["+",54535],["-",47052],["0",2160],["-,+",1530],["+,-",144],["-,-,+",44],["0,-,+",6],["+,-,+",5],["-,+,
+",4],["+,+,-,-",1],["-,+,-",1],["+,+,-",1],["0,0,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":13,"null_rate":0.0,"stats":{"cardinality":13,"entropy":1.235390288221728,"entropy_ratio":0.33384959148647436,"top_rate":0.5169978385347541,"top_value":"+"}},{"alerts":[],"column":"high","extras":{"singletons":2,"top_values":[["0",49247],["+",35559],["-",19156],["-,+",845],["+,-",627],["+,-,+",38],["+,+,-",6],["-,+,+",2],["-,-,+",2],["+,-,0",1],["-,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":11,"null_rate":0.0,"stats":{"cardinality":11,"entropy":1.5943943467615205,"entropy_ratio":0.46088332492884115,"top_rate":0.4668670130067119,"top_value":"0"}},{"alerts":[],"column":"low","extras":{"singletons":1,"top_values":[["-",49930],["0",49244],["+",5598],["+,-",417],["-,+",270],["-,+,-",21],["-,-,+",3],["+,-,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.3052472604529497,"entropy_ratio":0.43508242015098325,"top_rate":0.4733419286337264,"top_value":"-"}},{"alerts":[],"column":"front","extras":{"singletons":0,"top_values":[["0",49316],["-",34225],["+",20683],["-,+",838],["+,-",359],["-,-,+",24],["+,-,-",14],["-,+,+",10],["+,-,+",6],["-,0,+",3],["+,+,-",2],["0,-,+",2],["-,+,-",2]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":13,"null_rate":0.0,"stats":{"cardinality":13,"entropy":1.5917646093961206,"entropy_ratio":0.4301555303259311,"top_rate":0.46752114064692274,"top_value":"0"}},{"alerts":[],"column":"back","extras":{"singletons":3,"top_values":[["0",49270],["-",39749],["+",15547],["+,-",511],["-,+",367],["+,-,-",19],["-,-,+",8],["-,+,+",5],["-,+,-",5],["0,+,-",1],["+,-,+",1],["+,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":12,"null_rate":0.0,"stats":{"cardinality":12,"entropy":1.5214653035630892,"entropy_ratio":0.42440201348187856,"top_rate":0.46708505555344887,"top_value":"0"}},{"alerts":[],"column":"tense","extras":{"singletons":1,"top_values":[
["0",75230],["+",23411],["-",6386],["+,-",268],["-,+",179],["+,-,+",6],["+,-,-",3],["+,+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":8,"null_rate":0.0,"stats":{"cardinality":8,"entropy":1.113632933001339,"entropy_ratio":0.37121097766711303,"top_rate":0.7131887300443669,"top_value":"0"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 97.4% of rows"}],"column":"retractedTongueRoot","extras":{"singletons":2,"top_values":[["-",102788],["0",2235],["-,+",251],["+",199],["-,-,+",9],["-,+,-",1],["+,-",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":7,"null_rate":0.0,"stats":{"cardinality":7,"entropy":0.19349667140642499,"entropy_ratio":0.06892490503644791,"top_rate":0.9744416214781388,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 97.9% of rows"}],"column":"advancedTongueRoot","extras":{"singletons":0,"top_values":[["-",103238],["0",2235],["+",11]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.14958668464920608,"entropy_ratio":0.09437869008329491,"top_rate":0.9787076713056009,"top_value":"-"}},{"alerts":[],"column":"periodicGlottalSource","extras":{"singletons":0,"top_values":[["+",71694],["-",31179],["0",2139],["+,-",371],["-,+",87],["+,-,-",8],["+,-,+",6]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":7,"null_rate":0.0,"stats":{"cardinality":7,"entropy":1.0513847061624437,"entropy_ratio":0.3745107887505185,"top_rate":0.6796670585112434,"top_value":"+"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 97.9% of 
rows"}],"column":"epilaryngealSource","extras":{"singletons":0,"top_values":[["-",103303],["0",2150],["+",31]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.14744452201097066,"entropy_ratio":0.09302713593784306,"top_rate":0.9793238785029009,"top_value":"-"}},{"alerts":[],"column":"spreadGlottis","extras":{"singletons":2,"top_values":[["-",96855],["+",6156],["0",2138],["-,+",206],["+,-",115],["-,-,+",5],["+,0,-",5],["+,-,-",2],["+,0,-,-",1],["+,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":10,"null_rate":0.0,"stats":{"cardinality":10,"entropy":0.4965463130368085,"entropy_ratio":0.14947533446043632,"top_rate":0.9181961245307345,"top_value":"-"}},{"alerts":[],"column":"constrictedGlottis","extras":{"singletons":2,"top_values":[["-",99727],["+",3383],["0",2138],["+,-",141],["-,+",93],["+,-,-",1],["-,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":7,"null_rate":0.0,"stats":{"cardinality":7,"entropy":0.3717319778155273,"entropy_ratio":0.13241360217577067,"top_rate":0.9454230025406697,"top_value":"-"}},{"alerts":[],"column":"fortis","extras":{"singletons":0,"top_values":[["-",71867],["0",33202],["+",415]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.9335379523674724,"entropy_ratio":0.5889968702368124,"top_rate":0.6813071176671344,"top_value":"-"}},{"alerts":[],"column":"lenis","extras":{"singletons":0,"top_values":[["-",71866],["0",33202],["+",416]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.9336084306152392,"entropy_ratio":0.589041337060308,"top_rate":0.6812976375564067,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 96.4% of 
rows"}],"column":"raisedLarynxEjective","extras":{"singletons":1,"top_values":[["-",101652],["0",2150],["+",1573],["-,+",85],["+,-",23],["-,-,+",1]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":6,"null_rate":0.0,"stats":{"cardinality":6,"entropy":0.2674935220600104,"entropy_ratio":0.10348061992596981,"top_rate":0.9636722156914793,"top_value":"-"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 97.3% of rows"}],"column":"loweredLarynxImplosive","extras":{"singletons":0,"top_values":[["-",102609],["0",2150],["+",716],["-,+",7],["+,-",2]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":5,"null_rate":0.0,"stats":{"cardinality":5,"entropy":0.20336772197908162,"entropy_ratio":0.0875857105251776,"top_rate":0.9727446816578817,"top_value":"-"}},{"alerts":[],"column":"click","extras":{"singletons":0,"top_values":[["-",71971],["0",33202],["+",253],["+,-",52],["-,+",6]]},"kind":"categorical","n":105484,"n_null":0,"n_unique":5,"null_rate":0.0,"stats":{"cardinality":5,"entropy":0.9283203493380655,"entropy_ratio":0.3998058128424079,"top_rate":0.6822930491828144,"top_value":"-"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["SegmentClass.top_values","Source.top_values","LanguageName.n_unique","Glottocode.n_unique","Phoneme.top_values","row_count","column_count"],"featured_charts":[{"caption":"Look at the consonant-to-vowel-to-tone split \u2014 consonants make up roughly 68.5% of all records, which will skew any feature-level analysis.","column":"SegmentClass","kind":"donut"},{"caption":"Check how unevenly data is distributed across the eight source databases, with 'ph' contributing more than a third of all rows on its own.","column":"Source","kind":"bar"},{"caption":"See which languages contribute the most phoneme records \u2014 Iron Ossetic leads with 444 entries, hinting at uneven language-level representation.","column":"LanguageName","kind":"bar"},{"caption":"Review the balance of the nasal 
feature values (+, -, 0) as a representative example of how distinctive features are coded across the full inventory.","column":"nasal","kind":"bar"},{"caption":"Note that about 1.3% of phonemes are flagged as marginal (borrowed or rare) while ~19.8% carry NA, worth checking before any typological counts.","column":"Marginal","kind":"donut"}],"model":"anthropic:default","narrative":"This dataset is PHOIBLE, a cross-linguistic phonological inventory database containing 105,484 phoneme-level records spanning roughly 2,177 languages and dialects, each row describing a single phoneme and its distinctive feature values. The most immediate thing to examine is the breakdown by SegmentClass: consonants dominate (~68.5%), followed by vowels (~29.5%) and tones (~2%), which shapes how almost every other feature distributes. A second focus is the Source column, which reveals that data comes from eight different linguistic databases ('ph' alone accounts for 34%), meaning coverage and coding conventions are uneven across the corpus and could introduce systematic biases in any cross-linguistic comparison.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","n","duplicate_rate","n_duplicates","allcaps_rate","one_word_rate","len_median","len_mean","top_values"],"model":"anthropic:default","narrative":"GlyphID is a categorical identifier column containing Unicode code point values (e.g., '006D' = 'm', '0069' = 'i', '0061' = 'a') stored as uppercase hexadecimal strings, almost certainly representing character-level glyph references in a typography, OCR, or text-rendering dataset. With only 3,142 unique values across 105,484 rows, the duplicate rate is extremely high at 97.0%, which is expected for a glyph/character frequency table but means this column functions as a low-cardinality label rather than a unique identifier despite its name. 
All values are fully uppercase (allcaps_rate 1.0), single-token (one_word_rate 1.0), and short (median length 4 characters), consistent with 4\u20136 character hex codes. The top values map to common Latin lowercase letters, suggesting the underlying corpus is predominantly Latin-script text.","role":"label","scope":"column","target":"GlyphID","treatment":"Map hex strings to Unicode characters for interpretability, then encode as categorical (low-cardinality, 3142 levels) or group by Unicode block/script before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","duplicate_rate","n_duplicates","allcaps_rate","top_values","one_word_rate","vocab_size"],"model":"anthropic:default","narrative":"This column contains human language names (e.g., 'Dutch', 'Chechen', 'Bengali', 'Iron Ossetic'), functioning as a categorical label drawn from a linguistically diverse vocabulary of 2,716 unique values across 105,484 rows. The duplicate rate of 97.4% (102,768 duplicates) confirms it is a low-cardinality repeating label, not a free-text field. Notably, 13.1% of values are all-caps, suggesting some entries use ISO-style abbreviations or codes alongside full names. 
The distribution is uneven \u2014 the top value 'Iron Ossetic' appears 444 times while many languages appear rarely \u2014 indicating a long-tail spread across minority and regional languages.","role":"label","scope":"column","target":"LanguageName","treatment":"Encode as a categorical (label or target-encode) after normalising casing inconsistencies flagged by the 13.1% all-caps rate."},{"confidence":"high","critiques":[],"evidence_keys":["column","n","n_unique","duplicate_rate","n_duplicates","top_values","vocab_size","len_mean","len_median","len_max","one_word_rate"],"model":"anthropic:default","narrative":"This column contains allophone representations from a phonology or linguistics dataset, storing the phonetic variants of phonemes (e.g., 'm', 'j', 'w', 's') as very short strings with a mean length of ~2 characters. Strikingly, 53,580 of 105,484 rows (roughly 50.8%) carry the sentinel value 'NA', indicating no allophone recorded, which dominates the duplicate rate of 93.5%. The 6,892 unique values across a vocabulary of only 1,263 words suggest a modest set of phonetic symbols combined in small clusters, consistent with IPA or similar notation.","role":"feature","scope":"column","target":"Allophones","treatment":"Treat 'NA' as missing; encode remaining values as categorical or tokenize individual phoneme symbols before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["column","n_unique","n","duplicate_rate","n_duplicates","len_mean","len_median","len_max","top_values"],"model":"anthropic:default","narrative":"This column contains Glottocodes \u2014 the standardized 8-character language identifiers used by the Glottolog database (e.g., 'kham1282', 'dutc1256'), confirmed by the near-uniform length of 8 characters (mean 7.999, median 8.0) and the structured alphanumeric format. 
With only 2,177 unique codes across 105,484 rows, the duplicate rate is extremely high at 97.9%, meaning each language code recurs on average ~48 times \u2014 consistent with a dataset where many observations (e.g., words, features, speakers) are annotated per language. The top code 'kham1282' (Kham) appears 622 times, suggesting uneven language coverage in the dataset.","role":"foreign_key","scope":"column","target":"Glottocode","treatment":"Left-join on this code against the Glottolog reference table to enrich with language family, geographic coordinates, and macro-area metadata."},{"confidence":"high","critiques":[],"evidence_keys":["column","len_min","len_mean","len_max","null_rate","n_unique","duplicate_rate","n_duplicates","top_values"],"model":"anthropic:default","narrative":"This column contains ISO 639-3 three-letter language codes, a standardised identifier for natural languages. Every value is exactly 3 characters long (min, mean, max all equal 3) with zero nulls, confirming strict conformity to the standard. The duplicate rate is extremely high at 98.0%, meaning 2,095 distinct codes repeat across 105,484 rows \u2014 expected behaviour for a language tag applied to many records. The most frequent code 'mis' (miscellaneous/unattested language) appearing 828 times may warrant attention, as it signals a non-trivial share of records with unidentified languages.","role":"label","scope":"column","target":"ISO6393","treatment":"Use as a categorical grouping key; consider flagging or separating records where ISO6393 equals 'mis' (unattested) before language-based analysis."},{"confidence":"high","critiques":[],"evidence_keys":["column","n","n_unique","duplicate_rate","n_duplicates","len_min","len_median","len_mean","len_max","vocab_size","top_values","one_word_rate"],"model":"anthropic:default","narrative":"This column contains phoneme symbols from a linguistic or speech dataset, with 3,142 unique phoneme strings across 105,484 rows. 
Values are overwhelmingly single characters (len_median 1.0, len_mean 1.5, len_max 11), consistent with IPA or ARPABET-style phoneme notation. The duplicate rate is 97.0% (102,342 duplicates), which is expected given a finite phoneme inventory repeated across many words or utterances. The vocab_size of 1,339 distinct tokens against only 3,142 unique values suggests multi-character phoneme strings (e.g., digraphs or diacritics) are also present alongside single-character ones.","role":"label","scope":"column","target":"Phoneme","treatment":"Encode as categorical (label or one-hot) for modelling; 3,142 unique values is manageable but consider grouping by manner/place of articulation if dimensionality is a concern."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","entropy_ratio","n_unique","n","alerts"],"model":"anthropic:default","narrative":"This column encodes the Advanced Tongue Root (ATR) phonological feature, a binary articulatory distinction marked as '+' (present), '-' (absent), or '0' (neutral/not applicable), typical of linguistic phoneme inventories. The distribution is severely imbalanced: '-' dominates at 97.87% (103,238 of 105,484 rows), '0' appears in only 2,235 rows, and '+' is nearly absent with just 11 occurrences. 
The near-zero entropy ratio (0.094) confirms that '+ATR' is a vanishingly rare feature in this dataset, which would make any model predicting '+' extremely difficult to train without resampling.","role":"feature","scope":"column","target":"advancedTongueRoot","treatment":"Treat as ordinal or one-hot encoded categorical; oversample or reweight the '+' class (n=11) before modelling, or collapse '+' and '0' if class separation is infeasible."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n","top_values","entropy","n_unique","alerts"],"model":"anthropic:default","narrative":"This column encodes whether an epilaryngeal phonation source is present, using a ternary scheme: absent ('-'), present ('+'), or a neutral/ambiguous state ('0'). The distribution is severely imbalanced: the '-' (absent) class dominates at 97.9% of 105,484 rows, while '+' (present) appears in only 31 records (~0.03%), making positive-class detection extremely challenging. The near-zero entropy (0.147) confirms almost no informational variance in this column as-is.","role":"feature","scope":"column","target":"epilaryngealSource","treatment":"Flag severe class imbalance ('+' = 31 of 105,484); apply oversampling or class-weighted modelling, and consider binary encoding after collapsing '-' vs. non-'-'."},{"confidence":"high","critiques":[],"evidence_keys":["top_rate","top_value","top_values","entropy_ratio","cardinality","n","alerts"],"model":"anthropic:default","narrative":"This column encodes a phonological feature \u2014 specifically whether a sound has a lowered larynx implosive articulation \u2014 using a symbolic notation system ('+', '-', and combinations). The distribution is severely imbalanced: 97.3% of the 105,484 rows carry the default/absent value '-', with only 2,150 instances of '0', 716 of '+', and fewer than 10 combined entries. 
The near-zero entropy ratio (0.088) confirms this column carries almost no information for most records, which would make it a very weak standalone predictor.","role":"feature","scope":"column","target":"loweredLarynxImplosive","treatment":"Flag the severe imbalance; consider binarising ('+' vs. all others) or dropping if the rare positive class is insufficient for modelling purposes."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy_ratio","null_rate","top_values"],"model":"anthropic:default","narrative":"This column encodes the presence or absence of a raised-larynx ejective phonological feature, using a diacritic notation system common in linguistic databases (e.g., IPA or feature matrices). The dominant value is '-' (absent/not applicable), appearing in 96.4% of 105,484 rows, creating severe class imbalance with an entropy ratio of only 0.10. Minority values include '0' (2,150 rows), '+' (1,573 rows), and compound sequences like '-,+' and '+,-', suggesting some segments receive multi-valued or composite feature annotations. The near-absence of nulls (0.0%) indicates complete annotation coverage, but the extreme skew means this feature will contribute negligible signal in most modelling contexts without deliberate resampling or grouping.","role":"feature","scope":"column","target":"raisedLarynxEjective","treatment":"Collapse compound values ('-,+', '+,-', '-,-,+') into a unified category and apply oversampling or class-weight adjustment before modelling given 96.4% imbalance."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n","n_unique","cardinality","entropy_ratio","top_values","alerts"],"model":"anthropic:default","narrative":"This column encodes the phonetic feature 'retracted tongue root' (RTR), a binary or multi-valued linguistic annotation used in phonological databases to mark vowel or consonant articulation. 
The dominant value is '-' (absence of RTR) at 97.44% of 105,484 rows, making the feature extremely rare in this dataset \u2014 only ~2,696 tokens show any positive marking. The compound values ('-,+', '-,-,+', etc.) suggest per-segment annotation strings rather than single-token labels, indicating a sequence or multi-segment scope.","role":"feature","scope":"column","target":"retractedTongueRoot","treatment":"Flag severe class imbalance (97.44% negative); use oversampling or class-weighted models if predicting RTR; consider splitting compound strings into per-segment binary indicators before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy_ratio","top_values","n"],"model":"anthropic:default","narrative":"This column appears to encode a directional or sign indicator with only 4 distinct values: '-', '0', '+', and the compound '-,+'. It is severely imbalanced: the dominant value '-' accounts for 97.76% of rows (103,125 of 105,484), while '+' appears just 204 times and the mixed label '-,+' only 5 times. The near-zero entropy ratio (0.082) confirms the column carries very little information variance, and any model trained on it will be overwhelmed by the '-' class.","role":"feature","scope":"column","target":"short","treatment":"Treat as ordinal or one-hot encode, but flag severe class imbalance (97.8% '-'); consider oversampling minority classes or collapsing '+' and '-,+' before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["top_value","top_rate","n","top_values","entropy_ratio","n_unique","null_rate"],"model":"anthropic:default","narrative":"This column is a binary categorical flag likely indicating the presence or absence of stress (e.g., a linguistic or physiological stress marker), encoded as '-' for the negative/absent case and '0' for the positive/present case. 
The distribution is severely imbalanced: '-' accounts for 97.96% of all 105,484 rows (103,334 occurrences) versus only 2,150 rows for '0'. The entropy ratio of 0.144 confirms near-minimum information content, meaning the positive class is rare and models trained on this column will need class-balancing strategies.","role":"label","scope":"column","target":"stress","treatment":"Apply class-balancing (oversampling or weighted loss) before modelling; consider encoding '-' as 0 and '0' as 1 for numeric compatibility."},{"confidence":"medium","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy_ratio","top_values","null_rate"],"model":"anthropic:default","narrative":"This column appears to encode a directional or signed-change indicator, with values like '-', '+', '0', and compound sequences such as '-,+' and '-,-,+' that suggest a history of sign changes or transitions. The dominant value '-' accounts for 96.7% of all 105,484 rows, producing severe class imbalance (entropy ratio of only 0.10), meaning positive or mixed-direction events are rare signals. The compound multi-value entries ('-,+', '-,-,+') are notable \u2014 they imply the field can store ordered sequences of transitions, not just a single state. No nulls are present.","role":"feature","scope":"column","target":"tap","treatment":"One-hot encode single-value categories; treat compound sequence values ('-,+', '-,-,+') separately via sequence parsing or a dedicated binary flag for multi-transition events; oversample or weight minority classes before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","n","cardinality","entropy","null_rate","alerts"],"model":"anthropic:default","narrative":"This column is a binary tone indicator with only two values: '0' (neutral/absent) and '+' (positive), across 105,484 rows with no nulls. 
The distribution is severely imbalanced \u2014 '0' accounts for 97.96% of records (103,334) while '+' appears in just 2,150 rows (~2%). The entropy of 0.144 (out of a maximum of 1.0 for a binary variable) confirms the near-total dominance of a single class, which would make any model trained on this label prone to predicting '0' exclusively.","role":"label","scope":"column","target":"tone","treatment":"Apply class-balancing techniques (e.g., oversampling '+' or class-weight adjustment) before using as a modelling target."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy","top_values","n"],"model":"anthropic:default","narrative":"This column encodes a phonological feature \u2014 specifically the presence or absence of a trill articulation \u2014 using a small symbol vocabulary of 6 distinct values. The dominant value is \"-\" (absent/negative), which accounts for 96.15% of 105,484 rows, producing an extremely low entropy of 0.276 and triggering an imbalance alert. The minority values (\"0\", \"+\", and compound sequences like \"-,+\") together cover fewer than 4,000 rows, suggesting trill is a rare feature in this phonological dataset. 
The compound values (\"-,+\", \"-,-,+\", \"+,-\") with counts of 26, 8, and 2 respectively hint at multi-segment or allophonic annotation, but are statistically negligible.","role":"feature","scope":"column","target":"trill","treatment":"One-hot or ordinal encode with caution; severe class imbalance (96.15% negative class) means most models will need oversampling, class weighting, or collapsing rare compound categories before use."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","stats.min","stats.max","stats.skew","stats.kurtosis","stats.n_outliers","null_rate"],"model":"anthropic:default","narrative":"InventoryID is a numeric foreign key referencing an inventory dimension table, with exactly 3,020 distinct values spanning 1\u20133,020 across 105,484 rows, implying heavy repeated use of each ID (average ~35 rows per ID). The distribution is remarkably flat and symmetric (skew \u2248 \u22120.002, kurtosis \u2248 \u22121.15, zero outliers), consistent with a well-populated lookup identifier rather than a measured quantity.","role":"foreign_key","scope":"column","target":"InventoryID","treatment":"Left-join on this ID to the inventory dimension table; do not use raw numeric value as a feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","top_value","top_rate","top_values"],"model":"anthropic:default","narrative":"This column is a boolean flag indicating whether a record is classified as 'marginal', with three distinct string values: FALSE, NA, and TRUE. The dominant value is FALSE (78.9% of 105,484 rows), while TRUE is strikingly rare at only 1,347 records (\u22481.3%). 
The presence of 'NA' as a literal string value in 20,874 rows (\u224819.8%) is noteworthy \u2014 these are not system nulls (null_rate is 0.0) but encoded string missings, which must be handled explicitly rather than via standard null imputation.","role":"feature","scope":"column","target":"Marginal","treatment":"Encode as ternary (FALSE=0, TRUE=1, NA=missing) after converting string 'NA' to actual nulls, then decide on imputation strategy given 19.8% string-missing rate."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","top_value","top_rate","top_values","cardinality","entropy_ratio"],"model":"anthropic:default","narrative":"SegmentClass is a phonological category label classifying 105,484 linguistic segments into exactly three classes: consonant, vowel, and tone. Consonants dominate at 68.5% (72,282 occurrences), vowels account for 29.4% (31,052), and tones are a small minority at just 2.0% (2,150) \u2014 a distribution consistent with natural language phoneme inventories but with tones notably underrepresented, suggesting the dataset skews toward non-tonal languages or tonal markings are partially absent. Zero nulls and perfect coverage make this a clean, reliable label.","role":"label","scope":"column","target":"SegmentClass","treatment":"One-hot encode or use as a stratification variable; monitor class imbalance for the 'tone' minority class (2,150 samples) in any classification task."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","top_value","top_rate","top_values","entropy_ratio","null_rate","n"],"model":"anthropic:default","narrative":"This column identifies the source database or linguistic inventory from which each phonological record was drawn, with 8 distinct source codes across 105,484 rows and no nulls. The top source 'ph' dominates at 34.4% (36,274 rows), likely referring to PHOIBLE or a similar phoneme database, followed by 'ea', 'upsid', 'er', 'saphon', 'aa', 'spa', and 'ra'. 
The high entropy ratio of 0.899 indicates a relatively even spread across sources despite 'ph' leading \u2014 no single source overwhelmingly controls the data. Analysts should be aware that cross-source comparisons may introduce systematic coding differences, as each source may apply distinct phonological conventions.","role":"label","scope":"column","target":"Source","treatment":"One-hot encode or use as a stratification/grouping variable; check for source-specific biases before pooling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy_ratio","top_values","n","null_rate"],"model":"anthropic:default","narrative":"This column captures a specific dialect designation for each record in what appears to be a linguistic or language survey dataset. The dominant signal is extreme missingness-by-label: 71.9% of rows carry the value 'NA' and a further 7,692 rows (\u22487.3%) are empty strings, meaning roughly 79% of records lack a meaningful dialect value. Despite 546 unique values, the effective entropy ratio is only 0.33, confirming that real dialect labels are thinly and unevenly spread across the remaining ~22,000 rows.","role":"label","scope":"column","target":"SpecificDialect","treatment":"Treat 'NA' and empty-string as missing; consider collapsing rare dialects (below a frequency threshold) into an 'Other' bucket before encoding, and flag high missingness rate (~79%) before any modelling use."},{"confidence":"medium","critiques":[],"evidence_keys":["top_values","top_rate","n_unique","null_rate","cardinality"],"model":"anthropic:default","narrative":"This column captures an anterior anatomical or directional finding coded as a signed categorical variable, with 6 distinct values across 105,484 rows and zero nulls. The dominant value '0' (64.8% of rows) likely indicates absence or neutral status, while '+' (24.4%) and '-' (10.8%) denote positive/negative findings. 
Surprisingly, a small number of rows (9, 5, and 3) contain compound multi-value strings like '-,+', '+,-', and '-,-,+', suggesting occasional data-entry concatenation errors or multi-event encoding that deviates from the expected single-symbol schema.","role":"feature","scope":"column","target":"anterior","treatment":"Map '0'/'+'/'-' to ordinal or one-hot encoded features; isolate and investigate the 17 compound-value rows before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_values","top_rate","n_unique","entropy","null_rate","n"],"model":"anthropic:default","narrative":"This column captures whether a phoneme or linguistic segment is classified as an approximant, using a compact symbolic encoding where '-' means absent and '+' means present. The dominant value is '-' at 55.9% of rows (58,966), with '+' accounting for 41.9% (44,266), giving a near-binary distribution with very low entropy (1.12). Surprising are the small number of compound values ('-,+', '-,-,+', '+,-') totalling just 102 rows, suggesting a minority of segments carry ambiguous or multi-valued approximant classifications \u2014 possibly transcription artifacts or multi-segment bundles.","role":"feature","scope":"column","target":"approximant","treatment":"One-hot encode the two dominant values ('-', '+'); bin the three compound values into an 'ambiguous' category or flag and investigate as potential data quality issues before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["top_value","top_rate","top_values","n_unique","cardinality","null_rate"],"model":"anthropic:default","narrative":"This column encodes directional movement or change signals for a 'back' dimension, using a compact notation of '+', '-', and '0' tokens \u2014 likely representing price/value movement sequences (e.g., up, down, flat) over a lookback window. 
The dominant value is '0' at 46.7% (49,270 rows), followed by '-' at 39,749 and '+' at 15,547, indicating a strong bearish/negative skew relative to positive signals. Compound multi-step sequences like '+,-' (511) and '-,+' (367) exist but are rare, and a handful of three-step sequences appear at the tail, suggesting variable-length encoding that could cause parsing issues if treated as a simple label.","role":"feature","scope":"column","target":"back","treatment":"Parse composite multi-token values (e.g., '+,-') into structured sequences or ordinal scores before modelling; consider one-hot or frequency encoding for the simple single-token majority."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","cardinality","n","top_values","null_rate","entropy_ratio"],"model":"anthropic:default","narrative":"This column captures click direction or interaction type, encoded as symbolic tokens ('-', '0', '+') with combination values ('+,-', '-,+'). The dominant value is '-' at 68.2% of 105,484 rows, while positive-click signals ('+', '+,-', '-,+') account for only ~311 rows combined (~0.3%), suggesting a heavily imbalanced outcome or event flag. The presence of multi-value strings like '+,-' and '-,+' implies ordered sequences of click events were sometimes collapsed into a single cell rather than normalized \u2014 this warrants investigation before modelling.","role":"label","scope":"column","target":"click","treatment":"Ordinal-encode or one-hot after splitting multi-value entries ('+,-', '-,+'); treat class imbalance (~0.3% positive) before using as a classification target."},{"confidence":"high","critiques":[],"evidence_keys":["top_values","top_rate","n_unique","null_rate","cardinality"],"model":"anthropic:default","narrative":"This column encodes a binary phonological feature \u2014 consonantal \u2014 marking whether a segment is consonantal (+) or not (-), a standard distinctive feature in linguistics datasets. 
The dominant value is '+' at 60.9% (64,257 rows), with '-' covering 37.0% (39,041 rows). The near-zero third category '0' (2,151 rows) suggests underspecified segments, while the composite values '+,-' (34) and '-,+' (1) are anomalous multi-value entries that likely reflect data quality issues or ambiguous annotations.","role":"feature","scope":"column","target":"consonantal","treatment":"Encode as ordinal or one-hot after splitting composite entries ('+,-', '-,+') into separate flags or flagging them for review."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","cardinality","entropy","entropy_ratio","top_values","null_rate"],"model":"anthropic:default","narrative":"This column captures a clinical or phonetic annotation for constricted glottis, encoded as presence/absence symbols across 105,484 records with no nulls and only 7 unique values. The dominant value is '-' (absent/negative), appearing in 94.5% of rows (99,727), while '+' (present) accounts for just 3,383 rows \u2014 indicating a rare positive condition. The compound values ('+,-', '-,+', '+,-,-', '-,-,+') suggest multi-observation or multi-segment sequences for a tiny fraction of records, but with counts of 141, 93, 1, and 1 respectively, these are near-negligible. 
The extremely low entropy (0.372, entropy ratio 0.132) confirms the column is heavily imbalanced toward the negative class.","role":"feature","scope":"column","target":"constrictedGlottis","treatment":"Binarize into presence/absence flag; treat compound multi-segment values as positive; account for severe class imbalance (94.5% negative) during modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","n_unique","cardinality","entropy_ratio","null_rate"],"model":"anthropic:default","narrative":"This column encodes a phonological feature \u2014 whether a speech sound is a continuant (airflow continues through the vocal tract) \u2014 using '+', '-', and '0' (unspecified/N-A) notation standard in distinctive feature theory. The dominant values are '+' (54.9%, 57,952 rows) and '-' (44,585 rows), which together account for ~97% of records. Surprisingly, 7 of the 9 unique values are composite strings like '-,+' or '-,-,+', suggesting some rows encode sequences or multi-segment entries rather than single-segment features, which is atypical and may indicate data quality or schema inconsistency. Entropy ratio of 0.37 reflects moderate concentration driven almost entirely by the binary +/- split.","role":"feature","scope":"column","target":"continuant","treatment":"Treat '+'/'-'/'0' as a 3-class categorical; investigate and potentially split or flag the 796 composite multi-value rows before encoding."},{"confidence":"high","critiques":[],"evidence_keys":["top_values","n_unique","top_rate","n","null_rate","cardinality"],"model":"anthropic:default","narrative":"This column encodes coronal articulation features in what appears to be a phonological or linguistic dataset, using binary '+'/'-' notation common in distinctive feature theory. The dominant value is '-' (62.8% of 105,484 rows), with '+' covering another 35.0%, making the two primary values account for ~98% of observations. 
The remaining values ('+,-', '-,+', '-,-,+', '+,-,+') suggest multi-segment or compound entries, which are rare (\u226487 occurrences) but worth flagging as potential encoding inconsistencies or multi-value cells that may need splitting.","role":"feature","scope":"column","target":"coronal","treatment":"Encode primary values '+'/'-'/'0' as ordinal or one-hot features; isolate and inspect the 135 multi-value rows for parsing or splitting before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_values","n_unique","null_rate","top_rate","entropy_ratio","cardinality"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'delayed release' for 105,484 records using a small set of symbolic tokens: '0' (not applicable/unspecified, 55% of rows), '-' (feature absent, 26%), '+' (feature present, 19%), and compound combinations thereof. The compound values ('-,+', '0,-,+', '+,-', '0,0,-,+') suggest the column is sometimes multi-valued \u2014 packed as comma-separated strings \u2014 which is structurally inconsistent with a simple categorical and implies ordered, per-component values rather than a set (note the repeated '0' in '0,0,-,+'). No nulls are present, and the entropy ratio of 0.52 reflects a moderately skewed but non-trivial distribution dominated by the '0' class.","role":"feature","scope":"column","target":"delayedRelease","treatment":"Split compound values on ',' into multi-hot binary columns ('has_0', 'has_minus', 'has_plus') before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","top_values","entropy_ratio","n","null_rate"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'distributed', using a compact symbolic notation: '0' (not applicable/unspecified), '-' (feature absent), '+' (feature present), and comma-separated sequences for multi-part segments. The dominant value '0' covers 66% of 105,484 rows, with '-' (21.1%) and '+' (12.5%) accounting for most of the remainder. 
Surprisingly, 11 distinct values arise from combinations of these three symbols, suggesting some records capture sequences of per-component feature values rather than a single state. Entropy ratio of 0.37 confirms moderate but uneven information content, heavily concentrated in the single '0' class.","role":"feature","scope":"column","target":"distributed","treatment":"Encode as an ordinal or multi-hot feature by splitting on commas and mapping {'-': -1, '0': 0, '+': 1}; treat compound values as sequences."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","cardinality","n_unique","null_rate","entropy_ratio"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'dorsal' (articulation involving the tongue body), using '+', '-', and '0' symbols \u2014 sometimes as comma-separated sequences covering the components of a multi-part segment. The dominant values are '+' (51.7%, n=54,535) and '-' (n=47,052), together accounting for ~96.3% of rows, with unspecified/not-applicable coded as '0' (n=2,160). Surprisingly, 10 of the 13 categories (all but the atomic '+', '-', and '0') are multi-value strings like '-,+' or '-,-,+', suggesting some records encode ordered sequences of per-component values rather than a single binary state, creating an implicit structural inconsistency that warrants normalization.","role":"feature","scope":"column","target":"dorsal","treatment":"Split multi-value entries on ',' into ordered sub-features or one-hot encode each position before modelling; treat '+', '-', '0' as a ternary categorical for single-value rows."},{"confidence":"high","critiques":[],"evidence_keys":["top_values","top_rate","top_value","n_unique","n","null_rate","entropy_ratio"],"model":"anthropic:default","narrative":"This column is a three-valued categorical flag, likely representing a directional or strength indicator with values '-', '0', and '+'. 
The dominant value is '-' at 68.1% (71,867 rows), '0' accounts for 31.5% (33,202 rows), and '+' is strikingly rare at only 415 occurrences (~0.4%), creating a heavily imbalanced distribution. The near-absence of '+' values is a notable surprise and may indicate a rare positive condition, signal, or classification outcome worth investigating for class imbalance before modelling.","role":"feature","scope":"column","target":"fortis","treatment":"Ordinal-encode as -1/0/1 or one-hot encode, and apply class-imbalance handling (e.g. oversampling '+') before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["top_value","top_rate","top_values","n_unique","cardinality","null_rate"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'front', using values '0', '-', and '+' as atomic tokens that can be combined into sequences (e.g., '-,+', '+,-,-'). The dominant value '0' accounts for 46.75% of rows, with '-' (34,225) and '+' (20,683) making up most of the remainder \u2014 notably, '-' values outnumber '+' values by ~1.65:1. The compound sequence values ('-,+', '+,-', etc.) suggest the values of multi-part segments are encoded as a single string, which is an unusual encoding pattern that may require parsing before modelling.","role":"feature","scope":"column","target":"front","treatment":"Parse composite sequences (e.g., '-,+') into structured per-component arrays or ordinal counts; then one-hot or embed atomic states before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["top_values","top_rate","n_unique","cardinality","null_rate","n"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'high' (tongue-body height), using tokens '0' (not applicable/unspecified), '+' (feature present), and '-' (feature absent), including multi-token sequences like '-,+' or '+,-,+'. 
The dominant value is '0' at 46.7% (49,247 rows), followed by '+' at 35,559 and '-' at 19,156, so '+' values outnumber '-' values by roughly 1.9:1. The presence of compound sequence values (e.g., '+,-,+', '-,+,+') with very low frequency \u2014 some appearing only once or twice \u2014 indicates these multi-token patterns are rare edge cases that may need consolidation or separate treatment.","role":"feature","scope":"column","target":"high","treatment":"Encode top 3 values ('0', '+', '-') as ordinal or one-hot features; collapse rare multi-token sequences (\u2264845 occurrences) into an 'other' or structured sub-category."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","cardinality","top_values","null_rate","n"],"model":"anthropic:default","narrative":"This column captures labial articulation features in a phonological or linguistic dataset, encoding presence/absence/neutrality of a labial feature per segment (or per segment sequence). The dominant value is '-' (absent) at 68.2% of 105,484 rows, with '+' (present) at 26.8%, suggesting most segments are non-labial. Surprisingly, ~2.9% of values are compound strings like '-,+', '+,-', '-,-,+', indicating multi-segment bundles packed into a single cell rather than a flat atomic feature, which creates a parsing challenge and implies the column is not fully normalized.","role":"feature","scope":"column","target":"labial","treatment":"Split compound values on ',' into separate per-segment records or encode as ordered multi-label; then one-hot or ordinal encode the atomic {'-', '+', '0'} values for modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","cardinality","entropy_ratio","null_rate"],"model":"anthropic:default","narrative":"This column encodes labiodental phonetic feature annotations, likely from a linguistic or phonological dataset where each row represents a speech sound or segment. 
The dominant value '0' (74,124 rows, 70.3%) indicates the feature is absent or neutral, while '+' and '-' mark positive and negative feature values respectively \u2014 a standard binary distinctive-feature notation. Notably, multi-valued strings like '+,-', '-,+', and '+,+,-' appear (60 rows total), suggesting a small number of segments carry conflicting or composite annotations, which may indicate data entry inconsistency or deliberate underspecification. Entropy ratio of 0.39 confirms moderate imbalance with '0' dominating.","role":"feature","scope":"column","target":"labiodental","treatment":"One-hot encode '+', '-', '0'; flag or inspect the 60 multi-valued rows ('+,-', '-,+', '+,+,-') for normalization before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","n_unique","entropy_ratio","top_values","null_rate"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'lateral', marking whether a segment is a lateral sound, using '-' (feature absent), '+' (feature present), and '0' (not applicable/unspecified). The dominant value '-' accounts for 93.8% of all 105,484 rows, producing a very low entropy ratio of 0.134, meaning the column is heavily skewed toward a single class. Surprisingly, some values represent sequences of feature values (e.g., '-,+', '-,-,+'), suggesting multi-part segments encoded as comma-delimited strings rather than separate fields. 
No nulls are present.","role":"feature","scope":"column","target":"lateral","treatment":"Split comma-delimited compound values into multi-hot binary flags for '-', '+', and '0' presence before modelling; expect severe class imbalance."},{"confidence":"high","critiques":[],"evidence_keys":["top_value","top_rate","top_values","n","n_unique","null_rate","entropy"],"model":"anthropic:default","narrative":"This column represents a ternary linguistic or phonological feature, almost certainly a 'lenis' (weak/voiced) marker with values '-' (absent/irrelevant), '0' (neutral), and '+' (present/lenis). The dominant value is '-' at 68.1% (71,866 rows), while the positive lenis marker '+' is strikingly rare at only 416 occurrences (~0.4%), creating a heavily imbalanced distribution. No nulls exist across all 105,484 rows, and entropy is moderate at 0.93 bits, well below the theoretical maximum, confirming the skew toward the negative class.","role":"feature","scope":"column","target":"lenis","treatment":"Encode as ordinal or one-hot; be aware that the '+' class (416 samples) is severely underrepresented and will require oversampling or class-weight adjustment before modelling."},{"confidence":"medium","critiques":[],"evidence_keys":["n_unique","top_value","top_rate","top_values","entropy_ratio","null_rate"],"model":"anthropic:default","narrative":"This column encodes the phonological distinctive feature 'long' (segment length), with only 6 distinct values across 105,484 rows. The dominant value '-' accounts for 89.9% of records, indicating that the large majority of segments are not long. 
The compound values '-,+', '+,-', and '-,-,+' appear in just 104 rows combined, hinting at rare multi-value or malformed entries that warrant inspection before modelling.","role":"feature","scope":"column","target":"long","treatment":"Investigate compound values ('-,+', '+,-', '-,-,+') for parsing errors; encode '-' as -1, '+' as +1, '0' as 0, and flag or impute the 104 compound rows."}],"providers":["anthropic:default"],"total_usage":{"completion_tokens":11385,"prompt_tokens":40071,"total_tokens":51456}},"language_counts":{},"meta":{"generated_at":"2026-06-22T00:14:33+00:00","mode":"full","row_count":105484,"sampled_rows":105484,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/data/linguistic/phoible.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"Allophones":"text","Glottocode":"text","GlyphID":"text","ISO6393":"text","InventoryID":"numeric","LanguageName":"text","Marginal":"categorical","Phoneme":"text","SegmentClass":"categorical","Source":"categorical","SpecificDialect":"categorical","advancedTongueRoot":"categorical","anterior":"categorical","approximant":"categorical","back":"categorical","click":"categorical","consonantal":"categorical","constrictedGlottis":"categorical","continuant":"categorical","coronal":"categorical","delayedRelease":"categorical","distributed":"categorical","dorsal":"categorical","epilaryngealSource":"categorical","fortis":"categorical","front":"categorical","high":"categorical","labial":"categorical","labiodental":"categorical","lateral":"categorical","lenis":"categorical","long":"categorical","low":"categorical","loweredLarynxImplosive":"categorical","nasal":"categorical","periodicGlottalSource":"categorical","raisedLarynxEjective":"categorical","retractedTongueRoot":"categorical","round":"categorical","short":"categorical","sonorant":"categorical","spreadGlottis":"categorical","stress":"categorical","strident":"categorical","syllabic":"categorical","tap":"categorical","tense":"categorical","tone":"categorical","trill":
"categorical"}}
