{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ID","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[11,2651,0,0,0,0,0,2,17,53,89,137,122,0,110,90,67,52,48,0,21,22,20,17,10,11,0,7,3,0,2,3,2,0,1,2,2,0,0,1],"edges":[2.0,2.85,3.7,4.55,5.4,6.25,7.1,7.95,8.8,9.649999999999999,10.5,11.35,12.2,13.049999999999999,13.9,14.75,15.6,16.45,17.299999999999997,18.15,19.0,19.849999999999998,20.7,21.55,22.4,23.25,24.099999999999998,24.95,25.8,26.65,27.5,28.349999999999998,29.2,30.05,30.9,31.75,32.599999999999994,33.45,34.3,35.15,36.0]},"near_unique":true,"sample":["abd","genus-araucanian","genus-misumalpan","subfamily-palaihnihan","mmp","family-tunica","mrj","genus-northwestcaucasian","genus-huavean","arg","sss","pkm","kab","ada","kfy","eud","bou","family-kartvelian","family-yawa","mem","kkv","ash","dhm","brl","wly","genus-centralmalayopolynesian","rag","for","biu","family-tsimshianic","trb","tso","ady","mbz","let","mrd","family-southbirdshead","ojm","muh","pir","genus-guato","genus-angamipochuri","kan","genus-siamou","genus-totonacan","genus-samoyedic","kwa","mco","dig","genus-centralkainji"],"top_values":[],"top_words":[["aab",1],["aar",1],["aba",1],["abb",1],["abd",1],["abe",1],["abh",1],["abi",1],["abk",1],["abm",1],["abn",1],["abo",1],["abu",1],["abv",1],["abw",1],["abz",1],["ace",1],["acg",1],["ach",1],["aci",1],["acl",1],["acm",1],["acn",1],["aco",1],["acu",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3573,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":0,"n_unique":3573,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":36,"len_mean":5.9818080044780295,"len_median":3.0,"len_min":2,"len_p95":17.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":61.57700000000003,"url_rate":0.0,"vocab_size":3573,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"80.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"Name","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[112,361,528,581,502,305,198,152,84,88,131,89,76,77,66,46,34,31,21,19,21,8,9,9,5,2,3,4,2,3,2,1,1,0,0,0,0,0,0,2],"edges":[2.0,3.1,4.2,5.300000000000001,6.4,7.5,8.600000000000001,9.700000000000001,10.8,11.9,13.0,14.100000000000001,15.200000000000001,16.3,17.400000000000002,18.5,19.6,20.700000000000003,21.8,22.900000000000002,24.0,25.1,26.200000000000003,27.3,28.400000000000002,29.500000000000004,30.6,31.700000000000003,32.800000000000004,33.900000000000006,35.0,36.1,37.2,38.300000000000004,39.400000000000006,40.5,41.6,42.7,43.800000000000004,44.900000000000006,46.0]},"near_unique":false,"sample":["Abidji","Araucanian","Misumalpan","Palaihnihan","Mampruli","Tunica","Mirniny","Northwest Caucasian","Huavean","Arabic (Gulf)","Salish (Samish Straits)","Pokomch\u00ed","Kabardian","Adamorobe Sign Language","Kirghiz (Fu-Yu)","Eudeve","Berber (Wargla)","Kartvelian","Yawa","Manem","Lusi","Adyghe (Shapsugh)","Dhimal","Baragaunle","Wolaytta","Central Malayo-Polynesian","Raga","Fore","Bisu","Tsimshianic","Teribe","Tsou","Adyghe (Abzakh)","Mbe'","Leti","Marind","South Bird's Head","Ojibwe (Minnesota)","Muher","Piro","Guat\u00f3","Angami-Pochuri","Kana","Siamou","Totonacan","Samoyedic","Kwaio","Mixe (Coatl\u00e1n)","Digaro","Central Kainji"],"top_values":[["Abun",3],["Andoke",3],["Aikan\u00e1",3],["Ainu",3],["An\u00eam",3],["Atakapa",3],["Beothuk",3],["Berta",3],["Bangime",3],["Basque",3],["Betoi",3],["Burushaski",3],["Cams\u00e1",3],["Candoshi",3],["Cof\u00e1n",3],["Chiquitano",3],["Chitimacha",3],["Cuitlatec",3],["Cayuvava",3],["Esselen",3]],"top_words":[["sign",25],["language",22],["(in",22],["arabic",21],["german",17],["central",16],["mixtec",16],["(northern)",15],["basque",14],["(southern)",13],["creole",12],["south",12],["nahuatl",12],["eastern",12],["(western)",11],["west",11],["east",11],["(eastern)",10],["quechua",10],["western",10],["zapotec",10],["northern",10],["berber",9],["(san",8],["romani",8]],"vocab_skipped":null,"word_histogram":{"counts":[2859,0,0,0,0,0,566,0,0,0,0,0,104,0,0,0,0,0,29,0,0,0,0,0,14,0,0,0,0,1],"edges":[1.0,1.1666666666666667,1.3333333333333333,1.5,1.6666666666666665,1.8333333333333333,2.0,2.1666666666666665,2.333333333333333,2.5,2.6666666666666665,2.833333333333333,3.0,3.1666666666666665,3.333333333333333,3.5,3.6666666666666665,3.833333333333333,4.0,4.166666666666666,4.333333333333333,4.5,4.666666666666666,4.833333333333333,5.0,5.166666666666666,5.333333333333333,5.5,5.666666666666666,5.833333333333333,6.0]}},"kind":"text","n":3573,"n_null":0,"n_unique":3198,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.10495382031905962,"emoji_rate":0.0,"len_max":46,"len_mean":8.705009795689897,"len_median":7.0,"len_min":2,"len_p95":19.0,"n_duplicates":375,"n_empty":0,"one_word_rate":0.8001679261125105,"readability_flesch_mean":48.158075000000025,"url_rate":0.0,"vocab_size":3383,"word_mean":1.2580464595577945,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Macroarea","extras":{"singletons":0,"top_values":[["Eurasia",659],["Africa",606],["Papunesia",560],["North America",396],["South America",258],["Australia",183]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":6,"null_rate":0.25496781416176884,"stats":{"cardinality":6,"entropy":2.458602299570494,"entropy_ratio":0.9511172014621451,"top_rate":0.24755822689706988,"top_value":"Eurasia"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Latitude","extras":{"histogram":{"counts":[2,1,1,2,3,5,9,18,24,29,47,64,85,86,129,187,190,130,119,196,161,104,145,82,63,74,84,66,84,67,86,62,74,52,44,18,20,23,19,7],"edges":[-55.0,-51.84375,-48.6875,-45.53125,-42.375,-39.21875,-36.0625,-32.90625,-29.75,-26.59375,-23.4375,-20.28125,-17.125,-13.96875,-10.8125,-7.65625,-4.5,-1.34375,1.8125,4.96875,8.125,11.28125,14.4375,17.59375,20.75,23.90625,27.0625,30.21875,33.375,36.53125,39.6875,42.84375,46.0,49.15625,52.3125,55.46875,58.625,61.78125,64.9375,68.09375,71.25]},"sample":[-8.25,5.5,4.41666666667,-2.66667,32.0,-12.3333333333,15.4166666667,-6.16666666667,-12.6666666667,5.25,21.5,-4.66666666667,48.5,61.0,-22.5,-15.4166666667,-24.0,25.0,-14.0,-7.21666666667,26.0,40.0,-9.25,34.0,-16.3333333333,-6.5,0.0,42.3333333333,-17.0,40.5,2.66666666666667,26.0,-4.33333333333,-8.33333333333,12.5,-13.0833333333,-3.08333333333,-27.6666666667,-3.16666666667,10.3333333333,29.5833333333,27.5,11.9166666667,-8.25,-3.71666666667,8.25,-14.0,13.3333333333,6.5,15.75,54.0,35.0,-6.0,-17.75,-6.41666666667,7.83333333333,1.41666666667,5.26,-1.33333333333,43.3333333333,43.0,43.3333333333,43.0,42.0,10.0,48.0,28.5,-5.33333333333,1.5,17.3333333333,-4.5,29.1666666667,7.16666666667,3.5,-11.6333333333,-6.75,36.5,9.66666666667,3.0,6.33333333333,5.5,34.0,17.5833333333,42.6666666667,15.5,13.45,30.0,35.5,17.5833333333,-23.5,55.5,12.0,-14.0,23.0,23.0,36.8333333333333,48.0,-23.6666666667,15.2,50.3333333333,35.0,42.75,47.0,-12.1666666667,5.25,4.75,52.0,6.16666666667,-28.0,25.5,3.216666667,-3.33333333333,42.25,56.0,28.0,-17.8333333333,-4.33333333333,-3.0,-5.5,-0.5,-11.0,68.0,62.0,-17.8333333333,10.25,15.0,26.0,5.66666666667,8.66666666667,-9.75,47.0,6.0,52.5,18.75,10.0,-6.25,42.0,-28.4166666667,7.0,-5.0,22.9166666667,-6.33333333333,10.3333333333,12.0,4.58333333333,-18.3333333333,15.6666666667,-12.25,39.0,12.25,60.0,45.75,-17.0,6.66666666667,-12.0,4.58333333333,28.3333333333,-0.75,52.0,9.91666666667,47.4166666667,25.0,-1.0,10.25,49.25,-4.0,25.0,-6.33333333333,16.2166666667,42.1666666667,6.91666666667,-2.08333333333,-3.25,18.25,44.5,40.9166666667,45.0,20.5,-11.5,-12.25,31.5,17.5,30.6666666667,37.0,-5.0,-19.0,43.5,-6.25,35.5,4.75,16.5,-4.25,16.0,12.0,9.83333333333,14.25,12.5,-6.0,22.5,42.25,-4.25,37.0,10.25,8.83333333333,-8.0,-16.5,-5.46666666667,43.0,-3.0,22.5,20.75,27.5,42.5,3.66666666667,-20.5,-9.0,-3.91666666667,-26.0,10.25,-8.66666666667,39.5,-5.16666666667,-5.5,5.3,11.8666666667,15.3333333333,7.5,13.0,-2.83333333333,-20.0,54.6666666667,10.5,42.5833333333,41.0833333333,21.0833333333,67.0,-4.5,25.0,43.0,-11.0,65.0,34.0,22.6666666667,-12.6666666667,-9.58333333333,41.5,22.6666666667,2.0,-15.0,-2.83333333333,-19.45,27.1666666667,40.0,-9.75,42.0,55.0,11.25,-5.0,-2.06666666667,-0.5,-12.0,-3.0,-38.0,10.5,-11.9166666667,19.4166666667,2.5,-25.0,41.6666666667,52.0,-12.0,-28.0,-17.3333333333,46.5,8.75,-3.0,-9.91666666667,19.0,-3.75,12.0,-7.66666666667,-5.41666666667,11.5,44.5,-5.71666666667,3.0,9.66666666667,10.3333333333,-9.08333333333,40.0,-1.5,19.0,-7.05,53.0,10.5,14.8333333333,12.8333333333,16.5833333333,-4.83333333333,11.0,-13.8333333333,-8.58333333333,7.83333333333,-28.75,52.0,-4.91666666667,42.5,-4.88333333333,8.16666666667,-1.0,9.5,5.58333333333,17.1666666667,17.0833333333,16.9166666667,3.16666666667,10.0,18.25,8.0,36.3833,-5.58333333333,15.0833333333,31.75,12.0,3.56,8.66666666667,9.33333333333,-25.3333333333,7.75,-12.1666666667,52.25,-14.25,-36.0,-19.0,19.6666666667,20.1666666667,7.0,-2.5,-14.0,19.25,8.05,6.5,-22.3333333333,-14.5,-26.0,21.9166666667,-10.0,12.1666666667,-18.0,25.1666666667,-3.0,18.9166666667,37.5,47.6666666667,53.0,49.5,17.95,32.0,12.1666666667,37.0,9.0,-14.5,31.0,12.25,19.8333333333,6.21666666667,-26.0,15.4166666667,-32.5,1.5,39.0,39.0,-16.5,-10.0,-2.83333333333,-14.0,39.0,32.75,7.33333333333,28.0,-20.0,-14.5,0.333333333333,-15.5,23.75,11.75,-2.41666666667,-5.0,1.5,46.75,-10.6666666667,-6.0,-7.16666666667,-9.61666666667,6.16666666667,3.5,48.45,-18.0,40.0,-6.0,28.0,-6.0,-0.75,4.33333333333,7.0,69.0,20.0,32.25,47.25,52.5,0.5,38.75,-17.6666666667,9.75,6.16666666667,15.4166666667,20.5,16.0,-4.0,-3.46666666667,9.5,-7.71666666667,-6.5,49.75,15.0,-4.0,-5.0,45.3333333333,-21.6666666667,4.25,17.0833333333,37.5,11.0,3.0,42.3666666667,1.25,-19.5666666667,16.3333333333,1.08333333333,19.9166666667,21.75,-3.0,-6.0,55.3333333333,36.0,-17.0,27.0,39.0,63.0,47.6666666667,27.5,20.4166666667,-13.0,16.5,57.5,-1.83333333333,-4.5,-16.75,40.0,-22.0,-20.5,59.5,1.0,36.5,4.0,-22.0,-10.1666666667,-27.0,-32.0,40.8333333333,-19.5,-5.5,-6.16666666667,-4.25,-8.25,-18.5833333333,1.5,-13.1666666667,-4.5,31.6666666667,-37.5,44.3333333333,3.25,-4.08333333333,-14.5,-4.16666666667,28.0,39.6666666667,10.1666666667,-21.0,-7.5,22.0,7.83333333333,-16.4166666667,-6.58333333333,1.0,-17.3333333333,8.5,17.3333333333,-30.0]},"kind":"numeric","n":3573,"n_null":911,"n_unique":887,"null_rate":0.25496781416176884,"stats":{"iqr":33.0,"kurtosis":-0.5022853458255461,"max":71.25,"mean":11.880207700476019,"median":8.291666666665,"min":-55.0,"n_outliers":1,"outlier_rate":0.0003756574004507889,"q1":-5.0,"q3":28.0,"skew":0.3561639041397985,"std":22.72235997679527,"zero_rate":0.002253944402704733}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Longitude","extras":{"histogram":{"counts":[13,5,7,8,4,15,96,36,57,107,33,114,95,55,22,7,0,1,42,109,108,170,107,159,94,62,17,20,67,73,102,83,56,129,112,190,177,48,51,11],"edges":[-178.166666667,-169.23333333365,-160.30000000029997,-151.36666666694998,-142.4333333336,-133.50000000025,-124.56666666689999,-115.63333333354998,-106.70000000019999,-97.76666666685,-88.8333333335,-79.90000000014999,-70.9666666668,-62.03333333345,-53.100000000099996,-44.16666666674999,-35.2333333334,-26.300000000050005,-17.366666666700013,-8.433333333349992,0.5,9.433333333349992,18.366666666700013,27.300000000050005,36.2333333334,45.16666666674999,54.10000000009998,63.03333333345,71.9666666668,80.90000000015002,89.83333333350001,98.76666666685,107.70000000019999,116.63333333354998,125.56666666689998,134.50000000024997,143.43333333359996,152.36666666695,161.3000000003,170.23333333365,179.166666667]},"sample":[124.666666667,95.5,-72.25,-76.0,22.0,141.833333333,-91.3333333333,140.166666667,-60.6666666667,-4.5,92.5,143.333333333,7.5,165.0,135.0,167.883333333,136.0,42.0,136.5,146.25,49.0,45.0,161.166666667,38.0,167.666666667,145.75,16.5,46.3333333333,-69.0,48.5,-62.5,100.0,123.0,115.25,-7.5,-64.1666666667,119.083333333,118.0,132.666666667,34.6666666667,74.3333333333,92.5833333333,18.75,148.0,141.266666667,124.833333333,-55.0,123.5,34.5,38.5,28.0,76.0,21.0,125.75,155.25,-4.25,124.75,-67.56,17.3333333333,-3.0,-2.5,-2.5,-2.0,-3.0,2.5,-3.0,67.0,36.0,30.03,106.0,21.5,25.5,-71.25,8.66666666667,166.833333333,155.75,74.5,4.0,11.0,116.333333333,-56.0,-88.0,-96.4166666667,-124.0,-92.25,144.75,80.25,-83.5,-96.6666666667,-60.5,47.5,105.5,34.0,93.1666666667,113.0,-121.75,-95.0,-64.3333333333,145.75,-5.0,115.0,-76.75,-95.0,136.25,-5.66666666667,6.75,11.0,36.25,139.0,93.0,19.28333333,138.166666667,47.4166666667,10.0,98.3333333333,145.583333333,140.083333333,140.5,143.666666667,25.25,-66.0,130.0,-7.0,178.0,11.25,-14.0,119.5,-0.166666666667,7.41666666667,149.833333333,7.41666666667,15.0,13.3333333333,83.5,14.0,146.0,44.0,152.416666667,35.75,-46.0,105.5,144.75,12.5833333333,-72.0,7.33333333333,126.333333333,-88.0,134.416666667,22.0,1.0,-44.0,21.25,-58.0,-7.75,136.0,11.25,84.3333333333,34.8333333333,7.5,8.16666666667,8.5,116.0,134.0,123.0,-121.916666667,-78.0,118.0,145.333333333,-95.0,46.25,7.5,132.083333333,-74.0,121.0,11.3333333333,14.25,7.66666666667,122.0,132.666666667,-62.25,102.0,55.0,80.0,140.0,-54.5,21.0,43.5,145.666666667,72.5,7.41666666667,120.833333333,144.133333333,-89.8333333333,18.0,15.0833333333,-0.916666666667,-13.25,144.0,84.3333333333,46.1666666667,144.75,-99.0,-4.83333333333,-10.1666666667,143.5,142.5,120.333333333,60.0,137.166666667,78.5,73.5,87.0,-121.5,115.416666667,27.5,147.5,143.666666667,-52.0,-1.58333333333,140.916666667,-121.5,144.333333333,134.5,163.0,-2.91666666667,120.5,0.833333333333,99.1666666667,151.116666667,31.0,24.9166666667,30.0,46.3333333333,48.0,104.0,-146.0,140.5,102.75,47.3333333333,-55.5,55.0,78.0,93.6666666667,130.833333333,161.5,41.5,104.833333333,30.5,-67.9166666667,27.1666666667,169.25,88.5,-8.0,160.666666667,13.0,24.0,13.5833333333,105.0,147.333333333,34.75,22.0,36.0,-72.0,15.5,133.5,-99.9166666667,26.5,-57.5,21.75,-100.166666667,-72.6666666667,-60.5,132.083333333,-102.5,8.25,140.833333333,144.083333333,76.0,142.5,-16.25,139.416666667,119.583333333,13.75,-122.5,148.416666667,102.0,14.0,-0.666666666667,31.1666666667,-120.666666667,99.0,110.0,155.75,45.5,14.0,100.5,-1.25,-88.6666666667,139.25,13.0,130.0,140.666666667,124.25,147.0,-65.0,145.75,-71.0,145.75,38.1666666667,-72.5,8.83333333333,36.0833333333,-97.75,-96.0833333333,-97.5833333333,101.7,76.5,-96.8333333333,-2.58333333333,44.0,150.583333333,37.5833333333,-91.3333333333,31.0,18.36,-82.0,0.666666666667,29.0,27.0,134.116666667,5.5,130.416666667,140.0,141.5,-99.0,-98.0833333333,93.8333333333,140.166666667,-59.5,-99.1666666667,93.5,3.33333333333,20.5,30.8333333333,126.5,106.416666667,33.0,29.3333333333,124.333333333,93.5,138.0,-99.6666666667,93.0,-92.5,-90.0,-118.5,-95.0,-112.0,30.8333333333,-94.0,37.0,142.416666667,74.0,-69.0,83.8333333333,160.7,130.0,-90.5,142.5,35.5,-123.333333333,-122.666666667,168.25,-55.0,141.583333333,-57.5,-122.5,76.75,149.333333333,101.0,-66.0,-71.0,-78.0,168.166666667,92.75,-83.75,-71.5,34.6666666667,31.3333333333,9.5,123.25,155.166666667,144.25,161.45,121.833333333,125.5,-123.333333333,31.0,-83.0,17.6666666667,75.5,120.5,110.5,101.666666667,93.75,24.0,84.3333333333,78.0,-122.5,-116.0,128.0,99.75,-149.583333333,11.3333333333,124.5,-92.0833333333,-98.0,79.0,24.0,153.2,118.5,126.75,134.166666667,-121.75,101.0,-70.5,141.333333333,-123.75,165.75,126.75,-99.0,49.0,78.5,35.5,46.25,125.0,169.333333333,-91.5,124.833333333,-97.4166666667,-104.75,119.75,35.0,78.3333333333,49.6666666667,-144.0,-108.0,35.0,-137.0,-122.75,-108.0,-97.8333333333,130.333333333,-92.6666666667,52.5,120.0,-75.5,-69.0,80.0,30.0,118.5,30.0,-59.0,72.0,-77.0,123.0,150.166666667,137.0,134.0,-124.166666667,125.75,122.75,146.5,142.333333333,124.833333333,146.083333333,-67.5,131.25,146.0,119.916666667,145.5,-77.5,-54.1666666667,139.5,35.5,142.583333333,103.0,-123.5,-72.75,124.0,131.5,121.5,37.3333333333,137.166666667,139.25,-70.4166666667,138.833333333,25.25,-93.25,30.0]},"kind":"numeric","n":3573,"n_null":911,"n_unique":1360,"null_rate":0.25496781416176884,"stats":{"iqr":166.75,"kurtosis":-1.047464016029985,"max":179.166666667,"mean":35.17172277863297,"median":34.79166666665,"min":-178.166666667,"n_outliers":0,"outlier_rate":0.0,"q1":-45.75,"q3":121.0,"skew":-0.32589422156950143,"std":89.35197233204336,"zero_rate":0.0015026296018031556}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.0% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"Glottocode","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2645,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[7.5,7.525,7.55,7.575,7.6,7.625,7.65,7.675,7.7,7.725,7.75,7.775,7.8,7.825,7.85,7.875,7.9,7.925,7.95,7.975,8.0,8.025,8.05,8.075,8.1,8.125,8.15,8.175,8.2,8.225,8.25,8.275,8.3,8.325,8.35,8.375,8.4,8.425,8.45,8.475,8.5]},"near_unique":false,"sample":["chad1249","nyor1246","musl1236","taga1270","kuni1267","yukp1241","libe1247","tuva1244","tali1258","yane1238","motl1237","mudb1240","guaj1255","acha1249","fare1241","chep1245","baga1272","tlac1235","chim1300","kalm1243","hani1248","aona1235","chim1301","bisu1246","pagu1249","sabu1255","kham1281","chin1283","basa1284","yuki1243","nyan1304","napu1241","adyg1241","kirg1245","east2800","pamp1243","walm1241","lyel1241","puoc1238","mend1266","suki1245","ngur1261","guan1268","wakh1245","xere1240","yems1235","ishk1246","khun1259","siyi1240","trum1247"],"top_values":[["basq1248",11],["swis1247",6],["stan1295",6],["stra1244",6],["tibe1272",6],["nort3139",4],["band1339",4],["cari1279",4],["noga1249",4],["roma1326",4],["east2295",4],["adyg1241",3],["dutc1256",3],["bava1246",3],["hava1248",3],["halk1245",3],["iris1253",3],["balk1252",3],["tewa1260",3],["tata1255",3]],"top_words":[["basq1248",11],["swis1247",6],["stan1295",6],["stra1244",6],["tibe1272",6],["nort3139",4],["band1339",4],["cari1279",4],["noga1249",4],["roma1326",4],["east2295",4],["adyg1241",3],["dutc1256",3],["bava1246",3],["hava1248",3],["halk1245",3],["iris1253",3],["balk1252",3],["tewa1260",3],["tata1255",3],["tzel1254",3],["yoku1256",3],["cent2127",3],["zulu1248",3],["chad1249",2]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2645,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":928,"n_unique":2502,"null_rate":0.2597257206828995,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.05406427221172023,"emoji_rate":0.0,"len_max":8,"len_mean":8.0,"len_median":8.0,"len_min":8,"len_p95":8.0,"n_duplicates":143,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":92.87900000000003,"url_rate":0.0,"vocab_size":2502,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.8% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ISO639P3code","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0,3.025,3.05,3.075,3.1,3.125,3.15,3.175,3.2,3.225,3.25,3.275,3.3,3.325,3.35,3.375,3.4,3.425,3.45,3.475,3.5]},"near_unique":false,"sample":["shu","nyo","ttt","tgl","kup","yup","kpk","tvl","tlj","adx","pmw","moc","glg","acn","ksh","cap","bgg","tpt","zen","kio","hau","anu","cac","bpr","pgu","hvn","mik","cnt","bfa","yuk","nen","npy","ady","kir","jaa","xpe","wlo","lee","puo","dmw","sua","auc","gvc","van","xho","jnj","srl","kkh","cya","tpy"],"top_values":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3]],"top_words":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3],["ike",3],["gle",3],["xal",3],["mpj",3],["mig",3]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":959,"n_unique":2442,"null_rate":0.2684019031626085,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.06579954093343535,"emoji_rate":0.0,"len_max":3,"len_mean":3.0,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":172,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":119.52800000000003,"url_rate":0.0,"vocab_size":2442,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Family","extras":{"singletons":121,"top_values":[["Niger-Congo",324],["Austronesian",324],["Indo-European",176],["Sino-Tibetan",146],["Afro-Asiatic",145],["Pama-Nyungan",121],["Trans-New Guinea",98],["other",72],["Altaic",65],["Oto-Manguean",56],["Austro-Asiatic",48],["Eastern Sudanic",47],["Uto-Aztecan",44],["Algic",31],["Mayan",30],["Arawakan",29],["Nakh-Daghestanian",28],["Mande",28],["Uralic",27],["Hokan",26]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":254,"null_rate":0.25496781416176884,"stats":{"cardinality":254,"entropy":5.630997266685305,"entropy_ratio":0.7048716387579085,"top_rate":0.1217129977460556,"top_value":"Niger-Congo"}},{"alerts":[{"code":"null_rate","level":"warn","message":"74.5% null"}],"column":"Subfamily","extras":{"singletons":1,"top_values":[["Benue-Congo",200],["Eastern Malayo-Polynesian",159],["Tibeto-Burman",139],["Chadic",47],["Mon-Khmer",38],["Adamawa-Ubangi",30],["Gur",27],["Daghestanian",25],["Cushitic",24],["Finno-Ugric",21],["Kwa",20],["North-Central Atlantic",20],["Nilotic",19],["Mixtecan",18],["Omotic",15],["Kainantu-Goroka",14],["Madang",13],["Awyu-Ok",10],["Surmic",10],["Je",9]]},"kind":"categorical","n":3573,"n_null":2662,"n_unique":32,"null_rate":0.7450321858382312,"stats":{"cardinality":32,"entropy":3.8558857914373705,"entropy_ratio":0.7711771582874741,"top_rate":0.21953896816684962,"top_value":"Benue-Congo"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Genus","extras":{"singletons":306,"top_values":[["Oceanic",149],["Bantu",141],["Indic",50],["Western Pama-Nyungan",49],["Semitic",43],["Turkic",41],["Sign Languages",40],["Bodic",40],["Germanic",39],["Northern Pama-Nyungan",33],["Creoles and Pidgins",32],["Mayan",30],["Algonquian",29],["Central Malayo-Polynesian",29],["Iranian",26],["Romance",24],["Biu-Mandara",24],["Southeastern Pama-Nyungan",23],["Dravidian",23],["Malayo-Sumbawan",22]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":625,"null_rate":0.25496781416176884,"stats":{"cardinality":625,"entropy":7.950145602565396,"entropy_ratio":0.8559853360737966,"top_rate":0.05597295266716754,"top_value":"Oceanic"}},{"alerts":[{"code":"long_tail","level":"info","message":"601 singleton categories"},{"code":"null_rate","level":"warn","message":"82.5% null"}],"column":"GenusIcon","extras":{"singletons":601,"top_values":[["c688033",2],["c803E33",2],["c804733",2],["c807D33",2],["c806233",2],["c805033",2],["c7A8033",2],["c805933",2],["c807433",2],["c806B33",2],["c718033",2],["c803533",2],["cCC8C51",1],["cCC6851",1],["cCC7E51",1],["c8FCC51",1],["cCC8051",1],["c528033",1],["cCC9F51",1],["cCCB551",1]]},"kind":"categorical","n":3573,"n_null":2948,"n_unique":613,"null_rate":0.8250769661349007,"stats":{"cardinality":613,"entropy":9.249312379549451,"entropy_ratio":0.9988735233964605,"top_rate":0.0032,"top_value":"c688033"}},{"alerts":[{"code":"one_word","level":"warn","message":"99.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.1% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ISO_codes","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26],"edges":[3.0,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8,3.9,4.0,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0,5.1,5.2,5.300000000000001,5.4,5.5,5.6,5.7,5.800000000000001,5.9,6.0,6.1,6.2,6.300000000000001,6.4,6.5,6.6,6.7,6.800000000000001,6.9,7.0]},"near_unique":false,"sample":["shu","nyo","dto","sps","kpx","yux","kff","tue","tlj","ame","pmw","moc","gqu","acn","goa","cdm","bot","tio","zoh","kio","hni","njo","sgw","bom","pgu","hvn","mik","csl","brg","ywq","nen","npy","ady","kgy","jiv","kpk","wmt","luy","kju","mxu","sui","wmb","gqa","waw","xer","tao","its","khm","crw","tft"],"top_values":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3]],"top_words":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["gcf",3],["kal",3],["jya",3],["yuf",3],["hur",3],["ike",3],["gle",3],["xal",3],["mpj",3]],"vocab_skipped":null,"word_histogram":{"counts":[2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26],"edges":[1.0,1.0333333333333334,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666667,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333333,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5,1.5333333333333332,1.5666666666666667,1.6,1.6333333333333333,1.6666666666666665,1.7,1.7333333333333334,1.7666666666666666,1.8,1.8333333333333335,1.8666666666666667,1.9,1.9333333333333333,1.9666666666666668,2.0]}},"kind":"text","n":3573,"n_null":933,"n_unique":2468,"null_rate":0.2611251049538203,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.06515151515151515,"emoji_rate":0.0,"len_max":7,"len_mean":3.0393939393939395,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":172,"n_empty":0,"one_word_rate":0.9901515151515151,"readability_flesch_mean":117.41300000000003,"url_rate":0.0,"vocab_size":2486,"word_mean":1.0098484848484848,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"},{"code":"imbalance","level":"warn","message":"top value is 96.2% of rows"}],"column":"Samples_100","extras":{"singletons":0,"top_values":[["False",2562],["True",100]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":2,"null_rate":0.25496781416176884,"stats":{"cardinality":2,"entropy":0.23101765269146687,"entropy_ratio":0.23101765269146687,"top_rate":0.9624342599549212,"top_value":"False"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Samples_200","extras":{"singletons":0,"top_values":[["False",2462],["True",200]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":2,"null_rate":0.25496781416176884,"stats":{"cardinality":2,"entropy":0.38478791286167435,"entropy_ratio":0.38478791286167435,"top_rate":0.9248685199098422,"top_value":"False"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.7% null"}],"column":"Country_ID","extras":{"singletons":146,"top_values":[["PG",214],["AU",185],["US",177],["ID",177],["IN",120],["MX",120],["RU",89],["NG",66],["BR",66],["CN",54],["CD",49],["CM",46],["CA",45],["CO",39],["ET",36],["PH",36],["PE",35],["NP",32],["TZ",28],["VU",28]]},"kind":"categorical","n":3573,"n_null":918,"n_unique":337,"null_rate":0.25692695214105793,"stats":{"cardinality":337,"entropy":6.314446960448029,"entropy_ratio":0.75202383880205,"top_rate":0.08060263653483993,"top_value":"PG"}},{"alerts":[{"code":"one_word","level":"warn","message":"45.5% rows are a single word"},{"code":"null_rate","level":"warn","message":"30.1% null"}],"column":"Source","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[937,518,280,203,136,90,61,42,35,36,27,21,12,13,6,14,6,5,3,4,4,5,3,6,7,3,4,1,1,3,1,1,0,2,1,1,4,1,0,2],"edges":[7.0,18.125,29.25,40.375,51.5,62.625,73.75,84.875,96.0,107.125,118.25,129.375,140.5,151.625,162.75,173.875,185.0,196.125,207.25,218.375,229.5,240.625,251.75,262.875,274.0,285.125,296.25,307.375,318.5,329.625,340.75,351.875,363.0,374.125,385.25,396.375,407.5,418.625,429.75,440.875,452.0]},"near_unique":false,"sample":["Abu-Absi-1995","Grenoble-1992","Goldstein-1991","Ross-2002g","Bacelar-2004","Hanes-1952 de-Vegamian-1978","Laanest-1982 Leskinen-1984 Raun-1964b Rjagoev-1993","Haas-1940 Haas-1953 Nichols-1992 Swanton-1919 Swanton-1921","Ross-2002h","Duff-Tripp-1997 Fast-1953 Wise-1958 Wise-1978 Wise-1986 Wise-1990","Farris-1992","Voorhoeve-1975","Hindle-and-Rigsby-1973 Rigsby-1986","Maring-1967 Miller-1965 Miller-1966 Nichols-1992 Spencer-1946","Rapp-1966","Borman-1962 Borman-1976","Dondrup-1990","Bhaskararao-1972 Emeneau-1980 Gil-1994b Grierson-1906 Kelley-1963 Kostic-et-al-1977 Krishnamurti-1961 Krishnamurti-1998 Krishnamurti-and-Gwynn-1985 Krishnamurti-and-Sarma-1968 Lisker-1963 Malherbe-and-Rosenberg-1996 Petrunicheva-1960 Sastry-1972 Stolz-1996 Subbarao-and-Murthy-2000 Subrahmanyam-1974","Johnson-2000 Knudson-1980","Kastenholz-1987","Frajzyngier-and-Shay-2002","Clark-1893 Coupe-1999 Coupe-2007 Gowda-1972 Gowda-1975 Gurubasave-1975 Mills-1926","Chung-1983 Chung-1998 Cooreman-1982 Cooreman-1988 Costenoble-1940 Malherbe-and-Rosenberg-1996 Nichols-1992 Safford-1903-1905 Seiden-1960 Topping-1973 Topping-1980a Topping-1980b Topping-et-al-1975 Witucki-1974","Naden-1973 Prost-1950 Stolz-1996","Lauck-1976 Lauck-1979","Gault-1979","Birk-1975 Birk-1976 Mushin-1995 Nichols-1992 Tryon-1974","Durbin-and-Seijas-1972 Huber-and-Reed-1992 Robayo-1989","Bybee-et-al-1994 Mitterrutzner-1867 Spagnolo-1933 Stolz-1996","Gao-1958","Soukka-2000","Kutsch-Lojenga-1994","Bergsland-1956 Bergsland-1959 Bergsland-1994 Cho-et-al-1997 Golovko-2001 Jacobson-1944 Menovshchikov-1968 Rozelle-1997","Bokarev-1959","Hang-et-al-1989 Pevnov-1997","Donwa-Ifode-1990 Maingard-1962 Meinhof-1930","Nguyen-1998","Gordon-1986","Sims-and-Sims-1982","Bloomfield-1962 Miner-1975 Pesetsky-1979","Hatfield-2008 Voorhoeve-1975","Hawkins-1991 Hawkins-1998","Martius-1863","Cook-1987 Tryon-1971","Yi-2008","Dench-1998","Conrad-1971 Laycock-1965b","Tenishev-1997a","Kinkade-1963 Kinkade-1976 Kinkade-1991","Sande-and-Stokhof-1977"],"top_values":[["Huber-and-Reed-1992",14],["Boelaars-1950",11],["Tucker-and-Bryan-1966",9],["Voorhoeve-1975",8],["Adelaar-2004",7],["Omar-1983",7],["Hualde-1999",7],["ZGraggen-1969",6],["Llamzon-1978",5],["Portman-1887b",4],["Malherbe-and-Rosenberg-1996",4],["Laycock-and-Zgraggen-1975",4],["Derbyshire-1986 Derbyshire-and-Payne-1990",4],["Durbin-and-Seijas-1972",4],["Stevenson-1957 Tucker-and-Bryan-1966",4],["Thompson-et-al-1974",4],["Tryon-1974",4],["Dahl-1985",3],["Voorhoeve-1971",3],["Sharma-1989a",3]],"top_words":[["nichols-1992",113],["malherbe-and-rosenberg-1996",113],["stolz-1996",79],["bybee-et-al-1994",70],["dahl-1985",54],["tucker-and-bryan-1966",53],["huber-and-reed-1992",45],["haspelmath-1997",30],["derbyshire-and-payne-1990",29],["fabricius-1998",27],["adelaar-2004",16],["abbi-1992",16],["langacker-1976",16],["boelaars-1950",15],["aikhenvald-and-dixon-1999",14],["llamzon-1978",12],["hualde-1999",11],["mushin-1995",11],["wise-1978",10],["voorhoeve-1975",10],["martius-1863",9],["corbett-1991",9],["roca-1999",9],["noonan-2003c",8],["omar-1983",8]],"vocab_skipped":null,"word_histogram":{"counts":[1136,519,293,180,91,67,32,43,27,0,19,17,10,8,8,5,3,10,4,0,2,7,3,3,1,3,2,2,2,2],"edges":[1.0,1.9,2.8,3.7,4.6,5.5,6.4,7.3,8.2,9.1,10.0,10.9,11.8,12.700000000000001,13.6,14.5,15.4,16.3,17.2,18.1,19.0,19.900000000000002,20.8,21.7,22.6,23.5,24.400000000000002,25.3,26.2,27.1,28.0]}},"kind":"text","n":3573,"n_null":1074,"n_unique":2373,"null_rate":0.30058774139378674,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.05042016806722689,"emoji_rate":0.0,"len_max":452,"len_mean":42.07122849139656,"len_median":25.0,"len_min":7,"len_p95":135.0,"n_duplicates":126,"n_empty":0,"one_word_rate":0.4545818327330932,"readability_flesch_mean":21.332089495798346,"url_rate":0.0,"vocab_size":5899,"word_mean":2.854341736694678,"word_median":2.0}},{"alerts":[{"code":"long_tail","level":"info","message":"501 singleton categories"}],"column":"Parent_ID","extras":{"singletons":501,"top_values":[["genus-oceanic",149],["genus-bantu",141],["genus-indic",50],["genus-westernpamanyungan",49],["genus-semitic",43],["genus-turkic",41],["genus-signlanguages",40],["genus-bodic",40],["genus-germanic",39],["genus-northernpamanyungan",33],["genus-creolesandpidgins",32],["genus-mayan",30],["family-austronesian",30],["genus-algonquian",29],["genus-centralmalayopolynesian",29],["genus-iranian",26],["family-transnewguinea",25],["genus-romance",24],["genus-biumandara",24],["genus-southeasternpamanyungan",23]]},"kind":"categorical","n":3573,"n_null":254,"n_unique":911,"null_rate":0.07108872096277638,"stats":{"cardinality":911,"entropy":8.55358852285104,"entropy_ratio":0.8700357247245505,"top_rate":0.04489304007231094,"top_value":"genus-oceanic"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["columns","row_count","kinds"],"featured_charts":[{"caption":"Shows the six-region geographic split, led by Eurasia and Africa.","column":"Macroarea","kind":"donut"},{"caption":"Top language families \u2014 Niger-Congo and Austronesian tie at the top with 324 languages each.","column":"Family","kind":"bar"},{"caption":"Latitude distribution skews toward the tropics and northern hemisphere (median ~8\u00b0).","column":"Latitude","kind":"histogram"},{"caption":"Country concentration \u2014 Papua New Guinea, Australia, the US, and Indonesia host the most languages.","column":"Country_ID","kind":"bar"},{"caption":"Highlights the strong imbalance: only 100 of 2,662 non-null rows are flagged True.","column":"Samples_100","kind":"donut"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset is a catalogue of 3,573 world languages from WALS, with identifiers (ISO codes, Glottocode), names, geographic coordinates, and classification fields (Family, Genus, Subfamily, Macroarea) plus reference sources and sampling flags. The geographic and genealogical breakdowns are the most informative starting point: Macroarea splits cleanly across six regions led by Eurasia (659) and Africa (606), while Family is dominated by Niger-Congo and Austronesian (324 each). Worth a closer look: roughly a quarter of rows are missing core fields like Family, Genus, Macroarea, and coordinates (null rate ~0.255), and Subfamily is 74.5% null, which will limit any subfamily-level analysis. The Samples_100 and Samples_200 flags are highly imbalanced (only 100 and 200 True values respectively), reflecting their role as curated sub-samples rather than balanced categories.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.one_word_rate","stats.len_min","stats.len_max","stats.len_median","stats.n_duplicates","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"Column 'ID' is a unique row identifier: all 3573 values are distinct (n_unique equals n), every value is a single token (one_word_rate 1.0), and there are no nulls or duplicates. Lengths range from 2 to 36 characters with a median of 3, and the top tokens (aab, aar, aba, abb\u2026) suggest short alphabetic codes rather than numeric keys.","role":"identifier","scope":"column","target":"ID","treatment":"Use as a join key; drop from modelling features."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","duplicate_rate","n_duplicates","one_word_rate","word_mean","len_mean","len_max","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short proper-noun labels, almost certainly language names (top values like Basque, Ainu, Beothuk, Atakapa, and frequent words 'sign', 'language', 'arabic', 'mixtec' all point to a linguistic registry). Entries are overwhelmingly single tokens (one_word_rate 0.80, word_mean 1.26, len_mean 8.7) with a 46-character max for the longer parenthesised variants like '(northern)'/'(southern)'. Notably, 375 duplicates (10.5%) exist across 3,573 rows with 3,198 uniques \u2014 names like 'Abun', 'Andoke', 'Basque' each appear 3 times, suggesting the dataset repeats languages across some other dimension rather than being a clean key.","role":"label","scope":"column","target":"Name","treatment":"Treat as a categorical language label; deduplicate or join on it rather than using as a primary key."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Macroarea is a categorical geographic grouping with 6 values covering the standard continental/linguistic macroareas (Eurasia, Africa, Papunesia, North America, South America, Australia). The distribution is fairly balanced \u2014 entropy ratio is 0.95 and the top value Eurasia accounts for only 24.8% of rows. The main concern is a 25.5% null rate, meaning a quarter of the 3573 rows lack any macroarea assignment.","role":"feature","scope":"column","target":"Macroarea","treatment":"One-hot encode and decide whether to impute or add an explicit 'unknown' bucket for the 25.5% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","n","n_unique","stats.min","stats.max","stats.median","stats.mean","stats.skew","stats.kurtosis","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"Geographic latitude in decimal degrees, ranging from -55.0 to 71.25 with a median of 8.29 \u2014 consistent with global coverage skewed slightly toward the northern hemisphere. About 25.5% of rows are null, a notable gap for a positional field, and only 887 unique values across 3573 rows suggest coordinates are rounded or tied to a limited set of locations. Distribution is near-symmetric (skew 0.36, kurtosis -0.50) with just one outlier flagged.","role":"feature","scope":"column","target":"Latitude","treatment":"Pair with Longitude for geospatial features; impute or filter the 25.5% nulls before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median","stats.iqr","stats.skew","stats.kurtosis","stats.zero_rate"],"model":"anthropic:claude-opus-4-7","narrative":"Geographic longitude in decimal degrees, spanning -178.17 to 179.17 with 1360 distinct values across 3573 rows. The distribution is roughly symmetric (skew -0.33) but flat (kurtosis -1.05) with an IQR of 166.75, consistent with truly global coverage rather than a regional sample. Notable concern: 25.5% of rows are null, which will silently drop a quarter of any geospatial join.","role":"feature","scope":"column","target":"Longitude","treatment":"Pair with Latitude and impute or filter the 25.5% nulls before any geospatial modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_median","stats.one_word_rate","stats.duplicate_rate","stats.n_duplicates","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This is a Glottocode field \u2014 fixed 8-character language identifiers from the Glottolog catalogue (e.g. basq1248, swis1247), with every value being a single token. About 26% of rows are null and 2502 distinct codes appear across 3573 rows, with 143 duplicates (5.4%) where the same language repeats \u2014 basq1248 leads with 11 occurrences. Length is rigidly 8 for min, median, and max, consistent with a controlled vocabulary identifier rather than free text.","role":"foreign_key","scope":"column","target":"Glottocode","treatment":"Treat as a categorical key; left-join to Glottolog metadata and handle the 26% nulls explicitly."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","n_unique","stats.len_min","stats.len_max","stats.one_word_rate","stats.duplicate_rate","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds ISO 639-3 language codes: every non-null value is exactly 3 characters and one word (len_min=len_max=3, one_word_rate=1.0), with examples like 'eus', 'gsw', 'deu' matching the standard. Coverage is incomplete \u2014 26.84% of rows are null \u2014 and 2442 unique codes appear across 3573 rows with a 6.58% duplicate rate, so most languages occur only once or twice (top value 'eus' at 12).","role":"foreign_key","scope":"column","target":"ISO639P3code","treatment":"Treat as a categorical key; left-join to an ISO 639-3 reference table and decide on an explicit bucket for the 26.84% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical label for the language family of each row, with 254 distinct families across 3573 records. Distribution is long-tailed but not dominated: top value 'Niger-Congo' covers only 12.2% and ties exactly with 'Austronesian' at 324 each, with entropy ratio 0.70 indicating spread across many small families. Notable concern: 25.5% of rows are null, and a literal 'other' bucket already accounts for 72 rows.","role":"feature","scope":"column","target":"Family","treatment":"Impute or flag the 25.5% missing, then group rare families before one-hot or target encoding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column records the linguistic subfamily classification of each row, with 32 distinct values dominated by Benue-Congo (200 occurrences, 21.95% of non-nulls), Eastern Malayo-Polynesian (159), and Tibeto-Burman (139). The striking issue is the 74.5% null rate \u2014 only about a quarter of the 3573 rows carry a subfamily label \u2014 yet entropy ratio of 0.77 indicates the populated values are reasonably spread across the 32 categories rather than collapsing onto one.","role":"feature","scope":"column","target":"Subfamily","treatment":"Treat missingness as its own category and group rare subfamilies before one-hot encoding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds linguistic genus labels (e.g., Oceanic, Bantu, Indic, Semitic, Germanic), a mid-level grouping in language classification. Cardinality is high at 625 distinct values across 3573 rows with entropy ratio 0.856, so the distribution is broad and flat \u2014 the top value 'Oceanic' covers only 5.6%. Note the 25.5% null rate, which is flagged and would meaningfully shrink any analysis that conditions on genus.","role":"feature","scope":"column","target":"Genus","treatment":"Treat as a high-cardinality categorical: group rare genera into 'Other' or target-encode, and add an explicit missing indicator for the 25.5% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"GenusIcon holds 613 short hex-like codes (e.g. 'c688033') across 3573 rows, with 82.51% nulls and an entropy ratio of 0.9988 indicating values are nearly uniformly distributed among non-nulls. The top value appears only twice (top_rate 0.0032), so there is no dominant category \u2014 it behaves like a near-unique tag rather than a real categorical feature.","role":"identifier","scope":"column","target":"GenusIcon","treatment":"Drop for modelling; near-unique with 82% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_mean","stats.len_max","stats.one_word_rate","stats.duplicate_rate","stats.n_duplicates","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds ISO language codes \u2014 almost all values are single tokens of length 3 (len_mean 3.04, one_word_rate 0.99), matching ISO 639-3 conventions (e.g. 'eus', 'gsw', 'deu'). 26.1% of rows are null and 172 duplicates exist, but with 2,468 unique codes across 3,573 rows the vocabulary is unusually wide, suggesting broad multilingual coverage rather than a few dominant languages. No top value exceeds 12 occurrences, so the distribution has an extremely long tail.","role":"foreign_key","scope":"column","target":"ISO_codes","treatment":"Treat as a categorical code key; impute or filter the 26% nulls before joining to a language reference table."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","null_rate","top_rate","top_value","top_values","stats.entropy_ratio"],"model":"anthropic:claude-opus-4-7","narrative":"Boolean flag with only two values where 'False' dominates at 96.2% (2562 of 2662 non-null rows) and 'True' appears exactly 100 times. The null rate of 25.5% is high, suggesting the flag is only populated for a subset of records. Entropy ratio of 0.23 confirms severe imbalance.","role":"feature","scope":"column","target":"Samples_100","treatment":"Treat as a rare-event boolean indicator; impute or encode nulls explicitly and avoid as a stratification key given the imbalance."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Binary flag column with only two values (False/True) and heavy class imbalance \u2014 False accounts for 92.5% of non-null rows versus 200 True observations, hinting the name 'Samples_200' refers to a tagged 200-row subset. Roughly a quarter of rows (25.5%) are null, which is the main surprise and the reason for the null_rate alert. Entropy ratio of 0.385 confirms the distribution is far from balanced.","role":"label","scope":"column","target":"Samples_200","treatment":"Impute or explicitly encode nulls as a third category before using as a binary indicator."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Country_ID looks like an ISO-style two-letter country code, with 337 distinct values across 3573 rows and a fairly even spread (entropy ratio 0.752). The top country PG accounts for only 8.06% of rows, followed by AU, US, and ID. Notably, 25.69% of values are null, and the cardinality of 337 exceeds the ~250 ISO 3166-1 codes, suggesting non-standard or sub-region codes are mixed in.","role":"foreign_key","scope":"column","target":"Country_ID","treatment":"Impute or flag the 25.69% nulls and reconcile non-standard codes against an ISO 3166-1 reference before joining."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.one_word_rate","stats.duplicate_rate","stats.len_median","stats.word_median","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds bibliographic citation tags (e.g. 'Huber-and-Reed-1992', 'Boelaars-1950'), evidently the source reference for each row in what looks like a linguistic typology dataset. Values are short (median 25 chars, 2 words) and 45.5% are single tokens, consistent with author-year keys rather than prose. Cardinality is high (2373 unique of 3573) with 5% duplicates and a 30% null rate, so coverage is uneven and no single source dominates (top value appears only 14 times).","role":"metadata","scope":"column","target":"Source","treatment":"Treat as a citation key: keep as categorical provenance metadata, optionally normalize casing and join to a bibliography table; do not use as a model feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values","alerts"],"model":"anthropic:claude-opus-4-7","narrative":"Parent_ID looks like a foreign key pointing to a linguistic genus (e.g. 'genus-oceanic', 'genus-bantu'), grouping the 3573 rows into 911 parent categories. The distribution is long-tailed but flat \u2014 the top value covers only 4.5% of rows and entropy ratio is 0.87 \u2014 and 7.1% of values are null. Oceanic and Bantu dominate the head, with Indic, Western Pama-Nyungan and Semitic trailing far behind.","role":"foreign_key","scope":"column","target":"Parent_ID","treatment":"Left-join on this id to a genus lookup; treat the 7.1% nulls explicitly rather than one-hot encoding the 911 levels."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":5584,"prompt_tokens":23485,"total_tokens":29069}},"language_counts":{},"meta":{"generated_at":"2026-05-01T17:52:07+00:00","mode":"full","row_count":3573,"sampled_rows":3573,"seed":42,"source":"/home/coolhand/servers/diachronica/data_raw/wals_language.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"Country_ID":"categorical","Family":"categorical","Genus":"categorical","GenusIcon":"categorical","Glottocode":"text","ID":"text","ISO639P3code":"text","ISO_codes":"text","Latitude":"numeric","Longitude":"numeric","Macroarea":"categorical","Name":"text","Parent_ID":"categorical","Samples_100":"boolean","Samples_200":"boolean","Source":"text","Subfamily":"categorical"}}
