{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ID","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[11,2651,0,0,0,0,0,2,17,53,89,137,122,0,110,90,67,52,48,0,21,22,20,17,10,11,0,7,3,0,2,3,2,0,1,2,2,0,0,1],"edges":[2.0,2.85,3.7,4.55,5.4,6.25,7.1,7.95,8.8,9.649999999999999,10.5,11.35,12.2,13.049999999999999,13.9,14.75,15.6,16.45,17.299999999999997,18.15,19.0,19.849999999999998,20.7,21.55,22.4,23.25,24.099999999999998,24.95,25.8,26.65,27.5,28.349999999999998,29.2,30.05,30.9,31.75,32.599999999999994,33.45,34.3,35.15,36.0]},"near_unique":true,"sample":["abd","genus-araucanian","genus-misumalpan","subfamily-palaihnihan","mmp","family-tunica","mrj","genus-northwestcaucasian","genus-huavean","arg","sss","pkm","kab","ada","kfy","eud","bou","family-kartvelian","family-yawa","mem","kkv","ash","dhm","brl","wly","genus-centralmalayopolynesian","rag","for","biu","family-tsimshianic","trb","tso","ady","mbz","let","mrd","family-southbirdshead","ojm","muh","pir","genus-guato","genus-angamipochuri","kan","genus-siamou","genus-totonacan","genus-samoyedic","kwa","mco","dig","genus-centralkainji"],"top_values":[],"top_words":[["aab",1],["aar",1],["aba",1],["abb",1],["abd",1],["abe",1],["abh",1],["abi",1],["abk",1],["abm",1],["abn",1],["abo",1],["abu",1],["abv",1],["abw",1],["abz",1],["ace",1],["acg",1],["ach",1],["aci",1],["acl",1],["acm",1],["acn",1],["aco",1],["acu",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3573,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":0,"n_unique":3573,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":36,"len_mean":5.9818080044780295,"len_median":3.0,"len_min":2,"len_p95":17.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":61.57700000000003,"url_rate":0.0,"vocab_size":3573,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"80.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"Name","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[112,361,528,581,502,305,198,152,84,88,131,89,76,77,66,46,34,31,21,19,21,8,9,9,5,2,3,4,2,3,2,1,1,0,0,0,0,0,0,2],"edges":[2.0,3.1,4.2,5.300000000000001,6.4,7.5,8.600000000000001,9.700000000000001,10.8,11.9,13.0,14.100000000000001,15.200000000000001,16.3,17.400000000000002,18.5,19.6,20.700000000000003,21.8,22.900000000000002,24.0,25.1,26.200000000000003,27.3,28.400000000000002,29.500000000000004,30.6,31.700000000000003,32.800000000000004,33.900000000000006,35.0,36.1,37.2,38.300000000000004,39.400000000000006,40.5,41.6,42.7,43.800000000000004,44.900000000000006,46.0]},"near_unique":false,"sample":["Abidji","Araucanian","Misumalpan","Palaihnihan","Mampruli","Tunica","Mirniny","Northwest Caucasian","Huavean","Arabic (Gulf)","Salish (Samish Straits)","Pokomch\u00ed","Kabardian","Adamorobe Sign Language","Kirghiz (Fu-Yu)","Eudeve","Berber (Wargla)","Kartvelian","Yawa","Manem","Lusi","Adyghe (Shapsugh)","Dhimal","Baragaunle","Wolaytta","Central Malayo-Polynesian","Raga","Fore","Bisu","Tsimshianic","Teribe","Tsou","Adyghe (Abzakh)","Mbe'","Leti","Marind","South Bird's Head","Ojibwe (Minnesota)","Muher","Piro","Guat\u00f3","Angami-Pochuri","Kana","Siamou","Totonacan","Samoyedic","Kwaio","Mixe (Coatl\u00e1n)","Digaro","Central Kainji"],"top_values":[["Abun",3],["Andoke",3],["Aikan\u00e1",3],["Ainu",3],["An\u00eam",3],["Atakapa",3],["Beothuk",3],["Berta",3],["Bangime",3],["Basque",3],["Betoi",3],["Burushaski",3],["Cams\u00e1",3],["Candoshi",3],["Cof\u00e1n",3],["Chiquitano",3],["Chitimacha",3],["Cuitlatec",3],["Cayuvava",3],["Esselen",3]],"top_words":[["sign",25],["language",22],["(in",22],["arabic",21],["german",17],["central",16],["mixtec",16],["(northern)",15],["basque",14],["(southern)",13],["creole",12],["south",12],["nahuatl",12],["eastern",12],["(western)",11],["west",11],["east",11],["(eastern)",10],["quechua",10],["western",10],["zapotec",10],["northern",10],["berber",9],["(san",8],["romani",8]],"vocab_skipped":null,"word_histogram":{"counts":[2859,0,0,0,0,0,566,0,0,0,0,0,104,0,0,0,0,0,29,0,0,0,0,0,14,0,0,0,0,1],"edges":[1.0,1.1666666666666667,1.3333333333333333,1.5,1.6666666666666665,1.8333333333333333,2.0,2.1666666666666665,2.333333333333333,2.5,2.6666666666666665,2.833333333333333,3.0,3.1666666666666665,3.333333333333333,3.5,3.6666666666666665,3.833333333333333,4.0,4.166666666666666,4.333333333333333,4.5,4.666666666666666,4.833333333333333,5.0,5.166666666666666,5.333333333333333,5.5,5.666666666666666,5.833333333333333,6.0]}},"kind":"text","n":3573,"n_null":0,"n_unique":3198,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.10495382031905962,"emoji_rate":0.0,"len_max":46,"len_mean":8.705009795689897,"len_median":7.0,"len_min":2,"len_p95":19.0,"n_duplicates":375,"n_empty":0,"one_word_rate":0.8001679261125105,"readability_flesch_mean":48.158075000000025,"url_rate":0.0,"vocab_size":3383,"word_mean":1.2580464595577945,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Macroarea","extras":{"singletons":0,"top_values":[["Eurasia",659],["Africa",606],["Papunesia",560],["North America",396],["South America",258],["Australia",183]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":6,"null_rate":0.25496781416176884,"stats":{"cardinality":6,"entropy":2.458602299570494,"entropy_ratio":0.9511172014621451,"top_rate":0.24755822689706988,"top_value":"Eurasia"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Latitude","extras":{"histogram":{"counts":[2,1,1,2,3,5,9,18,24,29,47,64,85,86,129,187,190,130,119,196,161,104,145,82,63,74,84,66,84,67,86,62,74,52,44,18,20,23,19,7],"edges":[-55.0,-51.84375,-48.6875,-45.53125,-42.375,-39.21875,-36.0625,-32.90625,-29.75,-26.59375,-23.4375,-20.28125,-17.125,-13.96875,-10.8125,-7.65625,-4.5,-1.34375,1.8125,4.96875,8.125,11.28125,14.4375,17.59375,20.75,23.90625,27.0625,30.21875,33.375,36.53125,39.6875,42.84375,46.0,49.15625,52.3125,55.46875,58.625,61.78125,64.9375,68.09375,71.25]},"sample":[-8.25,5.5,4.41666666667,-2.66667,32.0,-12.3333333333,15.4166666667,-6.16666666667,-12.6666666667,5.25,21.5,-4.66666666667,48.5,61.0,-22.5,-15.4166666667,-24.0,25.0,-14.0,-7.21666666667,26.0,40.0,-9.25,34.0,-16.3333333333,-6.5,0.0,42.3333333333,-17.0,40.5,2.66666666666667,26.0,-4.33333333333,-8.33333333333,12.5,-13.0833333333,-3.08333333333,-27.6666666667,-3.16666666667,10.3333333333,29.5833333333,27.5,11.9166666667,-8.25,-3.71666666667,8.25,-14.0,13.3333333333,6.5,15.75,54.0,35.0,-6.0,-17.75,-6.41666666667,7.83333333333,1.41666666667,5.26,-1.33333333333,43.3333333333,43.0,43.3333333333,43.0,42.0,10.0,48.0,28.5,-5.33333333333,1.5,17.3333333333,-4.5,29.1666666667,7.16666666667,3.5,-11.6333333333,-6.75,36.5,9.66666666667,3.0,6.33333333333,5.5,34.0,17.5833333333,42.6666666667,15.5,13.45,30.0,35.5,17.5833333333,-23.5,55.5,12.0,-14.0,23.0,23.0,36.8333333333333,48.0,-23.6666666667,15.2,50.3333333333,35.0,42.75,47.0,-12.1666666667,5.25,4.75,52.0,6.16666666667,-28.0,25.5,3.216666667,-3.33333333333,42.25,56.0,28.0,-17.8333333333,-4.33333333333,-3.0,-5.5,-0.5,-11.0,68.0,62.0,-17.8333333333,10.25,15.0,26.0,5.66666666667,8.66666666667,-9.75,47.0,6.0,52.5,18.75,10.0,-6.25,42.0,-28.4166666667,7.0,-5.0,22.9166666667,-6.33333333333,10.3333333333,12.0,4.58333333333,-18.3333333333,15.6666666667,-12.25,39.0,12.25,60.0,45.75,-17.0,6.66666666667,-12.0,4.58333333333,28.3333333333,-0.75,52.0,9.91666666667,47.4166666667,25.0,-1.0,10.25,49.25,-4.0,25.0,-6.33333333333,16.2166666667,42.1666666667,6.91666666667,-2.08333333333,-3.25,18.25,44.5,40.9166666667,45.0,20.5,-11.5,-12.25,31.5,17.5,30.6666666667,37.0,-5.0,-19.0,43.5,-6.25,35.5,4.75,16.5,-4.25,16.0,12.0,9.83333333333,14.25,12.5,-6.0,22.5,42.25,-4.25,37.0,10.25,8.83333333333,-8.0,-16.5,-5.46666666667,43.0,-3.0,22.5,20.75,27.5,42.5,3.66666666667,-20.5,-9.0,-3.91666666667,-26.0,10.25,-8.66666666667,39.5,-5.16666666667,-5.5,5.3,11.8666666667,15.3333333333,7.5,13.0,-2.83333333333,-20.0,54.6666666667,10.5,42.5833333333,41.0833333333,21.0833333333,67.0,-4.5,25.0,43.0,-11.0,65.0,34.0,22.6666666667,-12.6666666667,-9.58333333333,41.5,22.6666666667,2.0,-15.0,-2.83333333333,-19.45,27.1666666667,40.0,-9.75,42.0,55.0,11.25,-5.0,-2.06666666667,-0.5,-12.0,-3.0,-38.0,10.5,-11.9166666667,19.4166666667,2.5,-25.0,41.6666666667,52.0,-12.0,-28.0,-17.3333333333,46.5,8.75,-3.0,-9.91666666667,19.0,-3.75,12.0,-7.66666666667,-5.41666666667,11.5,44.5,-5.71666666667,3.0,9.66666666667,10.3333333333,-9.08333333333,40.0,-1.5,19.0,-7.05,53.0,10.5,14.8333333333,12.8333333333,16.5833333333,-4.83333333333,11.0,-13.8333333333,-8.58333333333,7.83333333333,-28.75,52.0,-4.91666666667,42.5,-4.88333333333,8.16666666667,-1.0,9.5,5.58333333333,17.1666666667,17.0833333333,16.9166666667,3.16666666667,10.0,18.25,8.0,36.3833,-5.58333333333,15.0833333333,31.75,12.0,3.56,8.66666666667,9.33333333333,-25.3333333333,7.75,-12.1666666667,52.25,-14.25,-36.0,-19.0,19.6666666667,20.1666666667,7.0,-2.5,-14.0,19.25,8.05,6.5,-22.3333333333,-14.5,-26.0,21.9166666667,-10.0,12.1666666667,-18.0,25.1666666667,-3.0,18.9166666667,37.5,47.6666666667,53.0,49.5,17.95,32.0,12.1666666667,37.0,9.0,-14.5,31.0,12.25,19.8333333333,6.21666666667,-26.0,15.4166666667,-32.5,1.5,39.0,39.0,-16.5,-10.0,-2.83333333333,-14.0,39.0,32.75,7.33333333333,28.0,-20.0,-14.5,0.333333333333,-15.5,23.75,11.75,-2.41666666667,-5.0,1.5,46.75,-10.6666666667,-6.0,-7.16666666667,-9.61666666667,6.16666666667,3.5,48.45,-18.0,40.0,-6.0,28.0,-6.0,-0.75,4.33333333333,7.0,69.0,20.0,32.25,47.25,52.5,0.5,38.75,-17.6666666667,9.75,6.16666666667,15.4166666667,20.5,16.0,-4.0,-3.46666666667,9.5,-7.71666666667,-6.5,49.75,15.0,-4.0,-5.0,45.3333333333,-21.6666666667,4.25,17.0833333333,37.5,11.0,3.0,42.3666666667,1.25,-19.5666666667,16.3333333333,1.08333333333,19.9166666667,21.75,-3.0,-6.0,55.3333333333,36.0,-17.0,27.0,39.0,63.0,47.6666666667,27.5,20.4166666667,-13.0,16.5,57.5,-1.83333333333,-4.5,-16.75,40.0,-22.0,-20.5,59.5,1.0,36.5,4.0,-22.0,-10.1666666667,-27.0,-32.0,40.8333333333,-19.5,-5.5,-6.16666666667,-4.25,-8.25,-18.5833333333,1.5,-13.1666666667,-4.5,31.6666666667,-37.5,44.3333333333,3.25,-4.08333333333,-14.5,-4.16666666667,28.0,39.6666666667,10.1666666667,-21.0,-7.5,22.0,7.83333333333,-16.4166666667,-6.58333333333,1.0,-17.3333333333,8.5,17.3333333333,-30.0]},"kind":"numeric","n":3573,"n_null":911,"n_unique":887,"null_rate":0.25496781416176884,"stats":{"iqr":33.0,"kurtosis":-0.5022853458255461,"max":71.25,"mean":11.880207700476019,"median":8.291666666665,"min":-55.0,"n_outliers":1,"outlier_rate":0.0003756574004507889,"q1":-5.0,"q3":28.0,"skew":0.3561639041397985,"std":22.72235997679527,"zero_rate":0.002253944402704733}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Longitude","extras":{"histogram":{"counts":[13,5,7,8,4,15,96,36,57,107,33,114,95,55,22,7,0,1,42,109,108,170,107,159,94,62,17,20,67,73,102,83,56,129,112,190,177,48,51,11],"edges":[-178.166666667,-169.23333333365,-160.30000000029997,-151.36666666694998,-142.4333333336,-133.50000000025,-124.56666666689999,-115.63333333354998,-106.70000000019999,-97.76666666685,-88.8333333335,-79.90000000014999,-70.9666666668,-62.03333333345,-53.100000000099996,-44.16666666674999,-35.2333333334,-26.300000000050005,-17.366666666700013,-8.433333333349992,0.5,9.433333333349992,18.366666666700013,27.300000000050005,36.2333333334,45.16666666674999,54.10000000009998,63.03333333345,71.9666666668,80.90000000015002,89.83333333350001,98.76666666685,107.70000000019999,116.63333333354998,125.56666666689998,134.50000000024997,143.43333333359996,152.36666666695,161.3000000003,170.23333333365,179.166666667]},"sample":[124.666666667,95.5,-72.25,-76.0,22.0,141.833333333,-91.3333333333,140.166666667,-60.6666666667,-4.5,92.5,143.333333333,7.5,165.0,135.0,167.883333333,136.0,42.0,136.5,146.25,49.0,45.0,161.166666667,38.0,167.666666667,145.75,16.5,46.3333333333,-69.0,48.5,-62.5,100.0,123.0,115.25,-7.5,-64.1666666667,119.083333333,118.0,132.666666667,34.6666666667,74.3333333333,92.5833333333,18.75,148.0,141.266666667,124.833333333,-55.0,123.5,34.5,38.5,28.0,76.0,21.0,125.75,155.25,-4.25,124.75,-67.56,17.3333333333,-3.0,-2.5,-2.5,-2.0,-3.0,2.5,-3.0,67.0,36.0,30.03,106.0,21.5,25.5,-71.25,8.66666666667,166.833333333,155.75,74.5,4.0,11.0,116.333333333,-56.0,-88.0,-96.4166666667,-124.0,-92.25,144.75,80.25,-83.5,-96.6666666667,-60.5,47.5,105.5,34.0,93.1666666667,113.0,-121.75,-95.0,-64.3333333333,145.75,-5.0,115.0,-76.75,-95.0,136.25,-5.66666666667,6.75,11.0,36.25,139.0,93.0,19.28333333,138.166666667,47.4166666667,10.0,98.3333333333,145.583333333,140.083333333,140.5,143.666666667,25.25,-66.0,130.0,-7.0,178.0,11.25,-14.0,119.5,-0.166666666667,7.41666666667,149.833333333,7.41666666667,15.0,13.3333333333,83.5,14.0,146.0,44.0,152.416666667,35.75,-46.0,105.5,144.75,12.5833333333,-72.0,7.33333333333,126.333333333,-88.0,134.416666667,22.0,1.0,-44.0,21.25,-58.0,-7.75,136.0,11.25,84.3333333333,34.8333333333,7.5,8.16666666667,8.5,116.0,134.0,123.0,-121.916666667,-78.0,118.0,145.333333333,-95.0,46.25,7.5,132.083333333,-74.0,121.0,11.3333333333,14.25,7.66666666667,122.0,132.666666667,-62.25,102.0,55.0,80.0,140.0,-54.5,21.0,43.5,145.666666667,72.5,7.41666666667,120.833333333,144.133333333,-89.8333333333,18.0,15.0833333333,-0.916666666667,-13.25,144.0,84.3333333333,46.1666666667,144.75,-99.0,-4.83333333333,-10.1666666667,143.5,142.5,120.333333333,60.0,137.166666667,78.5,73.5,87.0,-121.5,115.416666667,27.5,147.5,143.666666667,-52.0,-1.58333333333,140.916666667,-121.5,144.333333333,134.5,163.0,-2.91666666667,120.5,0.833333333333,99.1666666667,151.116666667,31.0,24.9166666667,30.0,46.3333333333,48.0,104.0,-146.0,140.5,102.75,47.3333333333,-55.5,55.0,78.0,93.6666666667,130.833333333,161.5,41.5,104.833333333,30.5,-67.9166666667,27.1666666667,169.25,88.5,-8.0,160.666666667,13.0,24.0,13.5833333333,105.0,147.333333333,34.75,22.0,36.0,-72.0,15.5,133.5,-99.9166666667,26.5,-57.5,21.75,-100.166666667,-72.6666666667,-60.5,132.083333333,-102.5,8.25,140.833333333,144.083333333,76.0,142.5,-16.25,139.416666667,119.583333333,13.75,-122.5,148.416666667,102.0,14.0,-0.666666666667,31.1666666667,-120.666666667,99.0,110.0,155.75,45.5,14.0,100.5,-1.25,-88.6666666667,139.25,13.0,130.0,140.666666667,124.25,147.0,-65.0,145.75,-71.0,145.75,38.1666666667,-72.5,8.83333333333,36.0833333333,-97.75,-96.0833333333,-97.5833333333,101.7,76.5,-96.8333333333,-2.58333333333,44.0,150.583333333,37.5833333333,-91.3333333333,31.0,18.36,-82.0,0.666666666667,29.0,27.0,134.116666667,5.5,130.416666667,140.0,141.5,-99.0,-98.0833333333,93.8333333333,140.166666667,-59.5,-99.1666666667,93.5,3.33333333333,20.5,30.8333333333,126.5,106.416666667,33.0,29.3333333333,124.333333333,93.5,138.0,-99.6666666667,93.0,-92.5,-90.0,-118.5,-95.0,-112.0,30.8333333333,-94.0,37.0,142.416666667,74.0,-69.0,83.8333333333,160.7,130.0,-90.5,142.5,35.5,-123.333333333,-122.666666667,168.25,-55.0,141.583333333,-57.5,-122.5,76.75,149.333333333,101.0,-66.0,-71.0,-78.0,168.166666667,92.75,-83.75,-71.5,34.6666666667,31.3333333333,9.5,123.25,155.166666667,144.25,161.45,121.833333333,125.5,-123.333333333,31.0,-83.0,17.6666666667,75.5,120.5,110.5,101.666666667,93.75,24.0,84.3333333333,78.0,-122.5,-116.0,128.0,99.75,-149.583333333,11.3333333333,124.5,-92.0833333333,-98.0,79.0,24.0,153.2,118.5,126.75,134.166666667,-121.75,101.0,-70.5,141.333333333,-123.75,165.75,126.75,-99.0,49.0,78.5,35.5,46.25,125.0,169.333333333,-91.5,124.833333333,-97.4166666667,-104.75,119.75,35.0,78.3333333333,49.6666666667,-144.0,-108.0,35.0,-137.0,-122.75,-108.0,-97.8333333333,130.333333333,-92.6666666667,52.5,120.0,-75.5,-69.0,80.0,30.0,118.5,30.0,-59.0,72.0,-77.0,123.0,150.166666667,137.0,134.0,-124.166666667,125.75,122.75,146.5,142.333333333,124.833333333,146.083333333,-67.5,131.25,146.0,119.916666667,145.5,-77.5,-54.1666666667,139.5,35.5,142.583333333,103.0,-123.5,-72.75,124.0,131.5,121.5,37.3333333333,137.166666667,139.25,-70.4166666667,138.833333333,25.25,-93.25,30.0]},"kind":"numeric","n":3573,"n_null":911,"n_unique":1360,"null_rate":0.25496781416176884,"stats":{"iqr":166.75,"kurtosis":-1.047464016029985,"max":179.166666667,"mean":35.17172277863297,"median":34.79166666665,"min":-178.166666667,"n_outliers":0,"outlier_rate":0.0,"q1":-45.75,"q3":121.0,"skew":-0.32589422156950143,"std":89.35197233204336,"zero_rate":0.0015026296018031556}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.0% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"Glottocode","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2645,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[7.5,7.525,7.55,7.575,7.6,7.625,7.65,7.675,7.7,7.725,7.75,7.775,7.8,7.825,7.85,7.875,7.9,7.925,7.95,7.975,8.0,8.025,8.05,8.075,8.1,8.125,8.15,8.175,8.2,8.225,8.25,8.275,8.3,8.325,8.35,8.375,8.4,8.425,8.45,8.475,8.5]},"near_unique":false,"sample":["chad1249","nyor1246","musl1236","taga1270","kuni1267","yukp1241","libe1247","tuva1244","tali1258","yane1238","motl1237","mudb1240","guaj1255","acha1249","fare1241","chep1245","baga1272","tlac1235","chim1300","kalm1243","hani1248","aona1235","chim1301","bisu1246","pagu1249","sabu1255","kham1281","chin1283","basa1284","yuki1243","nyan1304","napu1241","adyg1241","kirg1245","east2800","pamp1243","walm1241","lyel1241","puoc1238","mend1266","suki1245","ngur1261","guan1268","wakh1245","xere1240","yems1235","ishk1246","khun1259","siyi1240","trum1247"],"top_values":[["basq1248",11],["swis1247",6],["stan1295",6],["stra1244",6],["tibe1272",6],["nort3139",4],["band1339",4],["cari1279",4],["noga1249",4],["roma1326",4],["east2295",4],["adyg1241",3],["dutc1256",3],["bava1246",3],["hava1248",3],["halk1245",3],["iris1253",3],["balk1252",3],["tewa1260",3],["tata1255",3]],"top_words":[["basq1248",11],["swis1247",6],["stan1295",6],["stra1244",6],["tibe1272",6],["nort3139",4],["band1339",4],["cari1279",4],["noga1249",4],["roma1326",4],["east2295",4],["adyg1241",3],["dutc1256",3],["bava1246",3],["hava1248",3],["halk1245",3],["iris1253",3],["balk1252",3],["tewa1260",3],["tata1255",3],["tzel1254",3],["yoku1256",3],["cent2127",3],["zulu1248",3],["chad1249",2]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2645,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":928,"n_unique":2502,"null_rate":0.2597257206828995,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.05406427221172023,"emoji_rate":0.0,"len_max":8,"len_mean":8.0,"len_median":8.0,"len_min":8,"len_p95":8.0,"n_duplicates":143,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":92.87900000000003,"url_rate":0.0,"vocab_size":2502,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.8% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ISO639P3code","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0,3.025,3.05,3.075,3.1,3.125,3.15,3.175,3.2,3.225,3.25,3.275,3.3,3.325,3.35,3.375,3.4,3.425,3.45,3.475,3.5]},"near_unique":false,"sample":["shu","nyo","ttt","tgl","kup","yup","kpk","tvl","tlj","adx","pmw","moc","glg","acn","ksh","cap","bgg","tpt","zen","kio","hau","anu","cac","bpr","pgu","hvn","mik","cnt","bfa","yuk","nen","npy","ady","kir","jaa","xpe","wlo","lee","puo","dmw","sua","auc","gvc","van","xho","jnj","srl","kkh","cya","tpy"],"top_values":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3]],"top_words":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3],["ike",3],["gle",3],["xal",3],["mpj",3],["mig",3]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3573,"n_null":959,"n_unique":2442,"null_rate":0.2684019031626085,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.06579954093343535,"emoji_rate":0.0,"len_max":3,"len_mean":3.0,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":172,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":119.52800000000003,"url_rate":0.0,"vocab_size":2442,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Family","extras":{"singletons":121,"top_values":[["Niger-Congo",324],["Austronesian",324],["Indo-European",176],["Sino-Tibetan",146],["Afro-Asiatic",145],["Pama-Nyungan",121],["Trans-New Guinea",98],["other",72],["Altaic",65],["Oto-Manguean",56],["Austro-Asiatic",48],["Eastern Sudanic",47],["Uto-Aztecan",44],["Algic",31],["Mayan",30],["Arawakan",29],["Nakh-Daghestanian",28],["Mande",28],["Uralic",27],["Hokan",26]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":254,"null_rate":0.25496781416176884,"stats":{"cardinality":254,"entropy":5.630997266685305,"entropy_ratio":0.7048716387579085,"top_rate":0.1217129977460556,"top_value":"Niger-Congo"}},{"alerts":[{"code":"null_rate","level":"warn","message":"74.5% null"}],"column":"Subfamily","extras":{"singletons":1,"top_values":[["Benue-Congo",200],["Eastern Malayo-Polynesian",159],["Tibeto-Burman",139],["Chadic",47],["Mon-Khmer",38],["Adamawa-Ubangi",30],["Gur",27],["Daghestanian",25],["Cushitic",24],["Finno-Ugric",21],["Kwa",20],["North-Central Atlantic",20],["Nilotic",19],["Mixtecan",18],["Omotic",15],["Kainantu-Goroka",14],["Madang",13],["Awyu-Ok",10],["Surmic",10],["Je",9]]},"kind":"categorical","n":3573,"n_null":2662,"n_unique":32,"null_rate":0.7450321858382312,"stats":{"cardinality":32,"entropy":3.8558857914373705,"entropy_ratio":0.7711771582874741,"top_rate":0.21953896816684962,"top_value":"Benue-Congo"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Genus","extras":{"singletons":306,"top_values":[["Oceanic",149],["Bantu",141],["Indic",50],["Western Pama-Nyungan",49],["Semitic",43],["Turkic",41],["Sign Languages",40],["Bodic",40],["Germanic",39],["Northern Pama-Nyungan",33],["Creoles and Pidgins",32],["Mayan",30],["Algonquian",29],["Central Malayo-Polynesian",29],["Iranian",26],["Romance",24],["Biu-Mandara",24],["Southeastern Pama-Nyungan",23],["Dravidian",23],["Malayo-Sumbawan",22]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":625,"null_rate":0.25496781416176884,"stats":{"cardinality":625,"entropy":7.950145602565396,"entropy_ratio":0.8559853360737966,"top_rate":0.05597295266716754,"top_value":"Oceanic"}},{"alerts":[{"code":"long_tail","level":"info","message":"601 singleton categories"},{"code":"null_rate","level":"warn","message":"82.5% null"}],"column":"GenusIcon","extras":{"singletons":601,"top_values":[["c688033",2],["c803E33",2],["c804733",2],["c807D33",2],["c806233",2],["c805033",2],["c7A8033",2],["c805933",2],["c807433",2],["c806B33",2],["c718033",2],["c803533",2],["cCC8C51",1],["cCC6851",1],["cCC7E51",1],["c8FCC51",1],["cCC8051",1],["c528033",1],["cCC9F51",1],["cCCB551",1]]},"kind":"categorical","n":3573,"n_null":2948,"n_unique":613,"null_rate":0.8250769661349007,"stats":{"cardinality":613,"entropy":9.249312379549451,"entropy_ratio":0.9988735233964605,"top_rate":0.0032,"top_value":"c688033"}},{"alerts":[{"code":"one_word","level":"warn","message":"99.0% rows are a single word"},{"code":"null_rate","level":"warn","message":"26.1% null"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"ISO_codes","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26],"edges":[3.0,3.1,3.2,3.3,3.4,3.5,3.6,3.7,3.8,3.9,4.0,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,4.9,5.0,5.1,5.2,5.300000000000001,5.4,5.5,5.6,5.7,5.800000000000001,5.9,6.0,6.1,6.2,6.300000000000001,6.4,6.5,6.6,6.7,6.800000000000001,6.9,7.0]},"near_unique":false,"sample":["shu","nyo","dto","sps","kpx","yux","kff","tue","tlj","ame","pmw","moc","gqu","acn","goa","cdm","bot","tio","zoh","kio","hni","njo","sgw","bom","pgu","hvn","mik","csl","brg","ywq","nen","npy","ady","kgy","jiv","kpk","wmt","luy","kju","mxu","sui","wmb","gqa","waw","xer","tao","its","khm","crw","tft"],"top_values":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["kal",3],["jya",3],["yuf",3],["hur",3]],"top_words":[["eus",12],["gsw",6],["deu",6],["str",6],["bod",6],["apc",4],["bdy",4],["cbd",4],["nog",4],["roh",4],["ydd",4],["ady",3],["jaa",3],["ute",3],["nld",3],["bar",3],["gcf",3],["kal",3],["jya",3],["yuf",3],["hur",3],["ike",3],["gle",3],["xal",3],["mpj",3]],"vocab_skipped":null,"word_histogram":{"counts":[2614,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26],"edges":[1.0,1.0333333333333334,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666667,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333333,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5,1.5333333333333332,1.5666666666666667,1.6,1.6333333333333333,1.6666666666666665,1.7,1.7333333333333334,1.7666666666666666,1.8,1.8333333333333335,1.8666666666666667,1.9,1.9333333333333333,1.9666666666666668,2.0]}},"kind":"text","n":3573,"n_null":933,"n_unique":2468,"null_rate":0.2611251049538203,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.06515151515151515,"emoji_rate":0.0,"len_max":7,"len_mean":3.0393939393939395,"len_median":3.0,"len_min":3,"len_p95":3.0,"n_duplicates":172,"n_empty":0,"one_word_rate":0.9901515151515151,"readability_flesch_mean":117.41300000000003,"url_rate":0.0,"vocab_size":2486,"word_mean":1.0098484848484848,"word_median":1.0}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"},{"code":"imbalance","level":"warn","message":"top value is 96.2% of rows"}],"column":"Samples_100","extras":{"singletons":0,"top_values":[["False",2562],["True",100]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":2,"null_rate":0.25496781416176884,"stats":{"cardinality":2,"entropy":0.23101765269146687,"entropy_ratio":0.23101765269146687,"top_rate":0.9624342599549212,"top_value":"False"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.5% null"}],"column":"Samples_200","extras":{"singletons":0,"top_values":[["False",2462],["True",200]]},"kind":"categorical","n":3573,"n_null":911,"n_unique":2,"null_rate":0.25496781416176884,"stats":{"cardinality":2,"entropy":0.38478791286167435,"entropy_ratio":0.38478791286167435,"top_rate":0.9248685199098422,"top_value":"False"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.7% null"}],"column":"Country_ID","extras":{"singletons":146,"top_values":[["PG",214],["AU",185],["US",177],["ID",177],["IN",120],["MX",120],["RU",89],["NG",66],["BR",66],["CN",54],["CD",49],["CM",46],["CA",45],["CO",39],["ET",36],["PH",36],["PE",35],["NP",32],["TZ",28],["VU",28]]},"kind":"categorical","n":3573,"n_null":918,"n_unique":337,"null_rate":0.25692695214105793,"stats":{"cardinality":337,"entropy":6.314446960448029,"entropy_ratio":0.75202383880205,"top_rate":0.08060263653483993,"top_value":"PG"}},{"alerts":[{"code":"one_word","level":"warn","message":"45.5% rows are a single word"},{"code":"null_rate","level":"warn","message":"30.1% null"}],"column":"Source","extras":{"language_counts":{},"language_sample_size":3573,"length_histogram":{"counts":[937,518,280,203,136,90,61,42,35,36,27,21,12,13,6,14,6,5,3,4,4,5,3,6,7,3,4,1,1,3,1,1,0,2,1,1,4,1,0,2],"edges":[7.0,18.125,29.25,40.375,51.5,62.625,73.75,84.875,96.0,107.125,118.25,129.375,140.5,151.625,162.75,173.875,185.0,196.125,207.25,218.375,229.5,240.625,251.75,262.875,274.0,285.125,296.25,307.375,318.5,329.625,340.75,351.875,363.0,374.125,385.25,396.375,407.5,418.625,429.75,440.875,452.0]},"near_unique":false,"sample":["Abu-Absi-1995","Grenoble-1992","Goldstein-1991","Ross-2002g","Bacelar-2004","Hanes-1952 de-Vegamian-1978","Laanest-1982 Leskinen-1984 Raun-1964b Rjagoev-1993","Haas-1940 Haas-1953 Nichols-1992 Swanton-1919 Swanton-1921","Ross-2002h","Duff-Tripp-1997 Fast-1953 Wise-1958 Wise-1978 Wise-1986 Wise-1990","Farris-1992","Voorhoeve-1975","Hindle-and-Rigsby-1973 Rigsby-1986","Maring-1967 Miller-1965 Miller-1966 Nichols-1992 Spencer-1946","Rapp-1966","Borman-1962 Borman-1976","Dondrup-1990","Bhaskararao-1972 Emeneau-1980 Gil-1994b Grierson-1906 Kelley-1963 Kostic-et-al-1977 Krishnamurti-1961 Krishnamurti-1998 Krishnamurti-and-Gwynn-1985 Krishnamurti-and-Sarma-1968 Lisker-1963 Malherbe-and-Rosenberg-1996 Petrunicheva-1960 Sastry-1972 Stolz-1996 Subbarao-and-Murthy-2000 Subrahmanyam-1974","Johnson-2000 Knudson-1980","Kastenholz-1987","Frajzyngier-and-Shay-2002","Clark-1893 Coupe-1999 Coupe-2007 Gowda-1972 Gowda-1975 Gurubasave-1975 Mills-1926","Chung-1983 Chung-1998 Cooreman-1982 Cooreman-1988 Costenoble-1940 Malherbe-and-Rosenberg-1996 Nichols-1992 Safford-1903-1905 Seiden-1960 Topping-1973 Topping-1980a Topping-1980b Topping-et-al-1975 Witucki-1974","Naden-1973 Prost-1950 Stolz-1996","Lauck-1976 Lauck-1979","Gault-1979","Birk-1975 Birk-1976 Mushin-1995 Nichols-1992 Tryon-1974","Durbin-and-Seijas-1972 Huber-and-Reed-1992 Robayo-1989","Bybee-et-al-1994 Mitterrutzner-1867 Spagnolo-1933 Stolz-1996","Gao-1958","Soukka-2000","Kutsch-Lojenga-1994","Bergsland-1956 Bergsland-1959 Bergsland-1994 Cho-et-al-1997 Golovko-2001 Jacobson-1944 Menovshchikov-1968 Rozelle-1997","Bokarev-1959","Hang-et-al-1989 Pevnov-1997","Donwa-Ifode-1990 Maingard-1962 Meinhof-1930","Nguyen-1998","Gordon-1986","Sims-and-Sims-1982","Bloomfield-1962 Miner-1975 Pesetsky-1979","Hatfield-2008 Voorhoeve-1975","Hawkins-1991 Hawkins-1998","Martius-1863","Cook-1987 Tryon-1971","Yi-2008","Dench-1998","Conrad-1971 Laycock-1965b","Tenishev-1997a","Kinkade-1963 Kinkade-1976 Kinkade-1991","Sande-and-Stokhof-1977"],"top_values":[["Huber-and-Reed-1992",14],["Boelaars-1950",11],["Tucker-and-Bryan-1966",9],["Voorhoeve-1975",8],["Adelaar-2004",7],["Omar-1983",7],["Hualde-1999",7],["ZGraggen-1969",6],["Llamzon-1978",5],["Portman-1887b",4],["Malherbe-and-Rosenberg-1996",4],["Laycock-and-Zgraggen-1975",4],["Derbyshire-1986 Derbyshire-and-Payne-1990",4],["Durbin-and-Seijas-1972",4],["Stevenson-1957 Tucker-and-Bryan-1966",4],["Thompson-et-al-1974",4],["Tryon-1974",4],["Dahl-1985",3],["Voorhoeve-1971",3],["Sharma-1989a",3]],"top_words":[["nichols-1992",113],["malherbe-and-rosenberg-1996",113],["stolz-1996",79],["bybee-et-al-1994",70],["dahl-1985",54],["tucker-and-bryan-1966",53],["huber-and-reed-1992",45],["haspelmath-1997",30],["derbyshire-and-payne-1990",29],["fabricius-1998",27],["adelaar-2004",16],["abbi-1992",16],["langacker-1976",16],["boelaars-1950",15],["aikhenvald-and-dixon-1999",14],["llamzon-1978",12],["hualde-1999",11],["mushin-1995",11],["wise-1978",10],["voorhoeve-1975",10],["martius-1863",9],["corbett-1991",9],["roca-1999",9],["noonan-2003c",8],["omar-1983",8]],"vocab_skipped":null,"word_histogram":{"counts":[1136,519,293,180,91,67,32,43,27,0,19,17,10,8,8,5,3,10,4,0,2,7,3,3,1,3,2,2,2,2],"edges":[1.0,1.9,2.8,3.7,4.6,5.5,6.4,7.3,8.2,9.1,10.0,10.9,11.8,12.700000000000001,13.6,14.5,15.4,16.3,17.2,18.1,19.0,19.900000000000002,20.8,21.7,22.6,23.5,24.400000000000002,25.3,26.2,27.1,28.0]}},"kind":"text","n":3573,"n_null":1074,"n_unique":2373,"null_rate":0.30058774139378674,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.05042016806722689,"emoji_rate":0.0,"len_max":452,"len_mean":42.07122849139656,"len_median":25.0,"len_min":7,"len_p95":135.0,"n_duplicates":126,"n_empty":0,"one_word_rate":0.4545818327330932,"readability_flesch_mean":21.332089495798346,"url_rate":0.0,"vocab_size":5899,"word_mean":2.854341736694678,"word_median":2.0}},{"alerts":[{"code":"long_tail","level":"info","message":"501 singleton categories"}],"column":"Parent_ID","extras":{"singletons":501,"top_values":[["genus-oceanic",149],["genus-bantu",141],["genus-indic",50],["genus-westernpamanyungan",49],["genus-semitic",43],["genus-turkic",41],["genus-signlanguages",40],["genus-bodic",40],["genus-germanic",39],["genus-northernpamanyungan",33],["genus-creolesandpidgins",32],["genus-mayan",30],["family-austronesian",30],["genus-algonquian",29],["genus-centralmalayopolynesian",29],["genus-iranian",26],["family-transnewguinea",25],["genus-romance",24],["genus-biumandara",24],["genus-southeasternpamanyungan",23]]},"kind":"categorical","n":3573,"n_null":254,"n_unique":911,"null_rate":0.07108872096277638,"stats":{"cardinality":911,"entropy":8.55358852285104,"entropy_ratio":0.8700357247245505,"top_rate":0.04489304007231094,"top_value":"genus-oceanic"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["Family.top_values","Macroarea.top_values","Genus.top_values","Country_ID.top_values","Samples_100.stats","Samples_200.stats","Latitude.stats","Longitude.stats","Subfamily.null_rate"],"featured_charts":[{"caption":"Top language families \u2014 Niger-Congo and Austronesian tie at 324 each, with a long tail across 254 families.","column":"Family","kind":"bar"},{"caption":"Six macroareas, led by Eurasia (659) and Africa (606); shows global coverage balance.","column":"Macroarea","kind":"donut"},{"caption":"Country distribution \u2014 Papua New Guinea, Australia, US, and Indonesia top the list, signaling linguistic hotspots.","column":"Country_ID","kind":"bar"},{"caption":"Latitude spread from -55 to 71 with median ~8\u00b0, showing a tropical/northern-hemisphere skew.","column":"Latitude","kind":"histogram"},{"caption":"Only 200 of 2,662 non-null rows are flagged True \u2014 useful for filtering to the curated WALS 200-sample set.","column":"Samples_200","kind":"donut"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset catalogs 3,573 world languages (WALS) across 17 columns combining identifiers (ISO codes, Glottocode), classifications (Family, Genus, Subfamily), geography (Latitude, Longitude, Macroarea, Country_ID), and sampling flags. The Family and Macroarea distributions are the most informative starting point: Niger-Congo and Austronesian dominate at 324 languages each, and Eurasia (659) and Africa (606) lead the macroareas out of just six categories. Note that roughly a quarter of rows (null_rate ~0.255) are missing geographic and family fields in lockstep, suggesting a shared set of unclassified entries worth investigating. The Samples_100 and Samples_200 flags are highly imbalanced (only 100 and 200 'True' values respectively), reflecting curated WALS sub-samples. Subfamily is sparsely populated (74.5% null) so treat it as supplementary rather than primary.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.duplicate_rate","stats.one_word_rate","stats.vocab_size","stats.len_median","stats.len_max","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This is an identifier column: every one of the 3573 rows holds a unique single-token string with no nulls or duplicates. Values are short (median length 3, max 36) and the vocabulary equals the row count (3573), confirming one-to-one uniqueness. Top tokens like 'aab', 'aar', 'aba' suggest short alphabetic codes rather than numeric keys.","role":"identifier","scope":"column","target":"ID","treatment":"drop from modelling; retain only as a join key."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.duplicate_rate","stats.n_duplicates","stats.one_word_rate","stats.len_mean","stats.word_mean","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short proper-noun labels \u2014 almost certainly language or ethnonym names, given top values like 'Basque', 'Ainu', 'Beothuk' and frequent words 'sign', 'language', 'arabic', 'german'. Entries are terse (mean 8.7 chars, 80% one-word) but not unique: 375 duplicates (10.5%) and only 3,198 distinct names across 3,573 rows, with several names appearing exactly 3 times \u2014 suggesting the dataset repeats each language across multiple records or variants (e.g. '(northern)', '(southern)'). No nulls, no URLs, no emoji.","role":"label","scope":"column","target":"Name","treatment":"Treat as a categorical key; deduplicate or join on a normalized form before aggregating."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Macroarea is a coarse geographic grouping with 6 categories spanning Eurasia, Africa, Papunesia, North America, South America, and Australia \u2014 consistent with WALS/Glottolog-style language area labels. Distribution is relatively even (entropy ratio 0.95, top value Eurasia at 24.8%), so no single region dominates. Note the 25.5% null rate, which is substantial and flagged.","role":"feature","scope":"column","target":"Macroarea","treatment":"One-hot encode and add an explicit 'missing' category to preserve the 25.5% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","stats.min","stats.max","stats.mean","stats.median","stats.q1","stats.q3","stats.iqr","stats.skew","stats.kurtosis","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"Geographic latitude in degrees, ranging from -55.0 to 71.25 with a median of 8.29 and IQR of 33.0, consistent with a worldwide point distribution. The 25.5% null rate is notable and flagged, while skew (0.36) and kurtosis (-0.50) indicate a fairly symmetric, slightly flat spread with only one outlier.","role":"feature","scope":"column","target":"Latitude","treatment":"Impute or filter the 25.5% missing values, and pair with longitude for any geospatial modelling."},{"confidence":"high","critiques":[],"evidence_keys":["min","max","skew","kurtosis","null_rate","n","n_unique","median","mean","q1","q3"],"model":"anthropic:claude-opus-4-7","narrative":"Geographic longitude in degrees, spanning the full globe from -178.17 to 179.17 with a near-zero skew (-0.33) and flat kurtosis (-1.05), consistent with a worldwide point distribution. The 25.5% null rate is the main concern, and despite 3573 rows only 1360 unique values appear, suggesting repeated locations or rounded coordinates. No outliers flagged, as expected for a bounded angular measure.","role":"feature","scope":"column","target":"Longitude","treatment":"Pair with Latitude for geospatial features; impute or drop the 25.5% missing before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.one_word_rate","stats.duplicate_rate","stats.n_duplicates","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds Glottocodes \u2014 fixed 8-character language identifiers from the Glottolog catalogue (e.g. 'basq1248', 'stan1295'), with every value a single token of length exactly 8. About 26% of rows are null and 2502 distinct codes cover 3573 records, with a 5.4% duplicate rate; the most repeated code 'basq1248' appears 11 times, suggesting multiple records can share a language.","role":"foreign_key","scope":"column","target":"Glottocode","treatment":"Left-join on this code to a Glottolog reference table; impute or flag the 26% nulls separately."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","len_min","len_max","len_mean","one_word_rate","n_unique","duplicate_rate","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds ISO 639-3 language codes \u2014 every non-null value is exactly 3 characters and a single word (len_mean 3.0, one_word_rate 1.0), with familiar codes like 'eus' (Basque), 'deu' (German), and 'gsw' (Swiss German) at the top. Coverage is incomplete: 26.84% of rows are null, and across 3573 rows there are 2442 unique codes with a 6.58% duplicate rate. Nothing in the evidence indicates which entity each code is tagging.","role":"foreign_key","scope":"column","target":"ISO639P3code","treatment":"Treat as a categorical join key to an ISO 639-3 reference table; impute or filter the 26.84% nulls before use."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical label assigning each of 3,573 rows to one of 254 language families, headed by Niger-Congo and Austronesian (tied at 324 rows, 12.2% each). The long tail is heavy \u2014 entropy ratio 0.705 indicates the distribution is fairly spread across families rather than dominated by a few \u2014 and 25.5% of rows are null, which is a substantial gap for what looks like a taxonomic feature.","role":"feature","scope":"column","target":"Family","treatment":"Impute or add an explicit 'unknown' category for the 25.5% nulls, then group rare families before encoding."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","n_unique","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column records the linguistic subfamily classification of entries, drawn from a controlled vocabulary of 32 values such as Benue-Congo, Eastern Malayo-Polynesian, and Tibeto-Burman. Coverage is the main concern: 74.5% of rows are null, leaving only ~910 labelled records, with Benue-Congo accounting for 21.95% of those. Among populated rows the distribution is reasonably diverse (entropy ratio 0.77), so the signal is informative where present but sparse overall.","role":"feature","scope":"column","target":"Subfamily","treatment":"Treat as a sparse categorical: impute an explicit 'unknown' level before encoding, since 74.5% are null."},{"confidence":"high","critiques":[],"evidence_keys":["column","kind","n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Genus is a linguistic genus label (subfamily-level grouping of languages), with values like Oceanic, Bantu, Indic, and Semitic. It is highly diverse \u2014 625 distinct genera across 3573 rows with entropy ratio 0.86 and the top value Oceanic covering only 5.6% \u2014 and 25.5% of rows are null, which is the flagged concern.","role":"feature","scope":"column","target":"Genus","treatment":"Treat as a high-cardinality categorical: target- or frequency-encode and explicitly model the 25.5% missing as its own category."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"GenusIcon is a high-cardinality categorical with 613 unique values across only 3573 rows, and 82.51% of those rows are null. Entropy ratio of 0.9988 and a top_rate of just 0.0032 mean the non-null values are nearly uniformly distributed, with the most frequent code 'c688033' appearing only twice. The hex-like tokens (e.g. 'c807D33') suggest icon identifiers or color/asset codes rather than a meaningful category.","role":"metadata","scope":"column","target":"GenusIcon","treatment":"Drop or retain as a sparse asset reference; not useful as a modelling feature given near-unique values and 82.51% nulls."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_p95","stats.one_word_rate","stats.n_duplicates","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Almost certainly ISO 639-3 language codes: 99% are single tokens, length is tightly clustered at 3 characters (min 3, max 7, p95 3), and top values like 'eus', 'deu', 'gsw', 'bod', 'roh' are recognisable three-letter language identifiers. Cardinality is high (2468 unique out of 3573) with a 26.1% null rate and 172 duplicates, so coverage is partial and no single code dominates (top value 'eus' appears just 12 times). The handful of length-7 entries is anomalous for a strict ISO 639-3 field and worth inspecting.","role":"feature","scope":"column","target":"ISO_codes","treatment":"Treat as a categorical code; validate against the ISO 639-3 list and investigate entries longer than 3 characters."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Boolean flag with only two values (False/True) where False dominates at 96.2% of non-null rows (2562 vs 100). The name 'Samples_100' plus the exact count of 100 True values suggests this marks a curated subset of 100 sampled records. A 25.5% null rate is notable and should be reconciled before use.","role":"feature","scope":"column","target":"Samples_100","treatment":"Treat as a boolean subset indicator; impute or exclude nulls and avoid using as a model feature given severe imbalance."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","null_rate","top_rate","top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Binary True/False flag, almost certainly indicating membership in a 200-row sample (the name 'Samples_200' and the exact count of 200 'True' values support this). The column is heavily imbalanced \u2014 'False' covers 92.5% of non-null rows \u2014 and 25.5% of values are null, which is unusual for a sampling indicator and worth investigating.","role":"metadata","scope":"column","target":"Samples_200","treatment":"Use as a boolean filter/split flag; reconcile the 25.5% nulls (treat as False or exclude) before relying on it."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Two-letter country codes (PG, AU, US, ID, IN...) identifying the country associated with each record, with 337 distinct values across 3573 rows. The cardinality is suspiciously high since ISO 3166-1 alpha-2 only defines ~250 codes, hinting at non-standard or sub-region codes mixed in. Distribution is fairly flat (entropy ratio 0.752, top value PG only 8.06%) and 25.69% of rows are null.","role":"foreign_key","scope":"column","target":"Country_ID","treatment":"Validate codes against ISO 3166, impute or flag the 25.69% nulls, then left-join on this id."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.one_word_rate","stats.len_median","stats.word_median","stats.duplicate_rate","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds bibliographic citation tags (e.g., 'Huber-and-Reed-1992', 'Boelaars-1950'), almost certainly the source reference for each row in what appears to be a linguistic dataset. About 45% of values are a single token and the median length is 25 chars, consistent with compact Author-Year keys, but 30% of rows are null and 2,373 of 3,573 values are unique, with only 126 duplicates. Top citations like 'nichols-1992' and 'malherbe-and-rosenberg-1996' (113 occurrences each) dominate, suggesting a few reference works supply many entries.","role":"metadata","scope":"column","target":"Source","treatment":"Normalize casing and keep as a categorical provenance tag; impute or flag the 30% nulls rather than modelling the text."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values","alerts"],"model":"anthropic:claude-opus-4-7","narrative":"Parent_ID looks like a foreign key pointing to a linguistic genus (e.g. 'genus-oceanic', 'genus-bantu'), grouping the 3573 rows into 911 parent categories. The distribution is long-tailed but not dominated: the top value covers only 4.5% of rows and entropy is 8.55 (ratio 0.87), so most genera carry few members. About 7.1% of values are null, which will need a decision before any join or grouping.","role":"foreign_key","scope":"column","target":"Parent_ID","treatment":"Left-join on this id to a genus lookup; impute or flag the 7.1% nulls before grouping."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":5566,"prompt_tokens":23481,"total_tokens":29047}},"language_counts":{},"meta":{"generated_at":"2026-05-01T17:52:01+00:00","mode":"full","row_count":3573,"sampled_rows":3573,"seed":42,"source":"/home/coolhand/datasets/language-data/wals_languages.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"Country_ID":"categorical","Family":"categorical","Genus":"categorical","GenusIcon":"categorical","Glottocode":"text","ID":"text","ISO639P3code":"text","ISO_codes":"text","Latitude":"numeric","Longitude":"numeric","Macroarea":"categorical","Name":"text","Parent_ID":"categorical","Samples_100":"boolean","Samples_200":"boolean","Source":"text","Subfamily":"categorical"}}
