{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"allcaps","level":"info","message":"99.1% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"key","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[9,0,0,0,0,0,0,0,0,0,90,0,0,0,0,0,0,0,0,0,873,0,0,0,0,0,0,0,0,0,8465,0,0,0,0,0,0,0,0,1922],"edges":[1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7000000000000002,1.8,1.9,2.0,2.1,2.2,2.3,2.4000000000000004,2.5,2.6,2.7,2.8,2.9000000000000004,3.0,3.1,3.2,3.3000000000000003,3.4000000000000004,3.5,3.6,3.7,3.8000000000000003,3.9000000000000004,4.0,4.1,4.2,4.300000000000001,4.4,4.5,4.6,4.7,4.800000000000001,4.9,5.0]},"near_unique":true,"sample":["82","9048","10407","9838","5031","11690","5220","10815","9943","59","7237","6326","3284","231","3448","2153","1033","10092","11929","4740","3605","91","1876","1092","8356","9184","6582","2200","778","11694","7833","7911","7910","4623","4253","5158","11295","6057","5297","6286","9820","11158","3254","11147","11507","11766","4033","4629","1831","10651"],"top_values":[],"top_words":[["47",1],["49",1],["50",1],["54",1],["66",1],["68",1],["69",1],["72",1],["75",1],["76",1],["79",1],["80",1],["81",1],["82",1],["83",1],["85",1],["105",1],["130",1],["131",1],["136",1],["141",1],["147",1],["149",1],["151",1],["156",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11359,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":11359,"n_null":0,"n_unique":11359,"null_rate":0.0,"stats":{"allcaps_rate":0.9912844440531737,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":5,"len_mean":4.074126243507351,"len_median":4.0,"len_min":1,"len_p95":5.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":121.22000000000004,"url_rate":0.0,"vocab_size":11359,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"multilingual","level":"info","message":"31 languages detected in sample"},{"code":"duplicates","level":"warn","message":"66.3% duplicate strings"}],"column":"author","extras":{"language_counts":{"__engine":"fasttext:4,147","ca":4,"ceb":3,"cs":4,"da":6,"de":384,"en":3125,"eo":6,"es":115,"et":1,"eu":2,"fi":10,"fr":136,"hr":2,"hu":4,"id":3,"it":91,"ja":46,"lt":1,"ms":2,"nl":107,"no":5,"pl":16,"pt":29,"ru":15,"sq":1,"sr":1,"sv":8,"tl":1,"vi":2,"zh":12},"language_sample_size":5000,"length_histogram":{"counts":[2178,6163,1293,1034,188,206,126,30,75,31,7,4,2,5,2,2,1,0,1,0,0,0,0,0,0,3,0,6,1,0,0,0,0,0,0,0,0,0,0,1],"edges":[0.0,11.325,22.65,33.974999999999994,45.3,56.625,67.94999999999999,79.27499999999999,90.6,101.925,113.25,124.57499999999999,135.89999999999998,147.225,158.54999999999998,169.875,181.2,192.52499999999998,203.85,215.17499999999998,226.5,237.825,249.14999999999998,260.47499999999997,271.79999999999995,283.125,294.45,305.775,317.09999999999997,328.42499999999995,339.75,351.075,362.4,373.72499999999997,385.04999999999995,396.375,407.7,419.025,430.34999999999997,441.67499999999995,453.0]},"near_unique":false,"sample":["Boutkan, Dirk and Siebinga, Sjoerd M.","Peiros, Ilia","","Chlenova, Svetlana","Spratt, David and Nancy","Oshika, Beatrice R. T.","Stokhof, W. A. L.","Blench, Roger M.","Haupers, Ralph and Haupers, Lorraine","Daud, Bukhari and Durie, Mark","Koelle, Sigismund Wilhelm","Nurse, Derek and Philippson, G\u00e9rard","\u9648\u6653\u9526","Project, The Rosetta","Stairs Kreger, Glenn Albert and de Stairs, Emily Florence Scharfe","\u6c5f\u837b\u3001\u674e\u5927\u52e4\u3001\u5b59\u5b8f\u5f00","B\u00fchnen, Stefan","Biggs, Bruce","Holst, Jan Henrik","Paulin, Pascale","Bender, M. Lionel","Smallhorn, J.","Gerzenstein, Ana","Kasond, Makasae and Raymond, Alexander","Tryon, Darrell T.","Fabre, Anne Gwenaelle","Reesink, Ger P.","Koelle, Sigismund Wilhelm","Van der Veen, Lolke","Edel'man, Dz\u030coy I.","Beck, Simone and Beyer, Daniela","Aboagye, P. A. Kwesi","","Chamberlain, Wendy and Chamberlain, Brad and Pavey, Emma","Voorhoeve, C. L.","Koshal, Sanyukta.","Hudak, Thomas J.","\u767d\u4e91","","Barrett, Bevan","Shimizu, Kiyoshi","Storch, Anne","\u8c22\u7559\u6587","McDonald, M. and Wurm, Stephen A.","\u90b9\u598d","Auderset, Sandra","Ibopishak Singh, P.","Dimmendaal, Gerrit Jan","Rensch, Calvin R.","Miskow, Johan and Br\u00f8ndal, Viggo"],"top_values":[["",1277],["Koelle, Sigismund Wilhelm",193],["Tryon, Darrell T.",191],["Blench, Roger",162],["Nurse, Derek and Philippson, G\u00e9rard",119],["\u5b59\u5b8f\u5f00\u3001\u4e01\u90a6\u65b0\u3001\u6c5f\u837b\u3001\u71d5\u6d77\u96c4",113],["Bender, M. Lionel",107],["Z'graggen, Johannes A.",106],["Project, The Rosetta",101],["Voorhoeve, C. L.",91],["Laycock, Donald C.",88],["Auderset, Sandra",74],["Piron, Pascale",69],["\u9648\u7ae0\u592a\u3001\u674e\u884c\u5065",66],["Kraft, Charles H.",61],["Van der Veen, Lolke",60],["Lastra de Suarez, Yolanda",60],["Huber, Randall Q. and Reed, Robert B.",59],["Hooley, Bruce A.",56],["Blench, Roger M.",52]],"top_words":[["and",4049],["a.",746],["l.",478],["c.",463],["m.",399],["j.",367],["t.",277],["roger",263],["e.",258],["blench,",237],["r.",228],["john",228],["w.",218],["wilhelm",217],["tryon,",215],["darrell",213],["david",201],["koelle,",194],["sigismund",194],["de",190],["h.",189],["robert",186],["william",175],["g.",172],["k.",171]],"vocab_skipped":null,"word_histogram":{"counts":[9184,1596,203,241,101,9,7,4,1,2,0,3,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1],"edges":[1.0,4.633333333333333,8.266666666666666,11.9,15.533333333333333,19.166666666666668,22.8,26.433333333333334,30.066666666666666,33.7,37.333333333333336,40.96666666666667,44.6,48.233333333333334,51.86666666666667,55.5,59.13333333333333,62.766666666666666,66.4,70.03333333333333,73.66666666666667,77.3,80.93333333333334,84.56666666666666,88.2,91.83333333333333,95.46666666666667,99.1,102.73333333333333,106.36666666666666,110.0]}},"kind":"text","n":11359,"n_null":0,"n_unique":3830,"null_rate":0.0,"stats":{"allcaps_rate":0.05889602958006867,"boilerplate_rate":0.0,"duplicate_rate":0.6628224315520732,"emoji_rate":0.0,"len_max":453,"len_mean":20.427150277313142,"len_median":17.0,"len_min":0,"len_p95":50.0,"n_duplicates":7529,"n_empty":1277,"one_word_rate":0.18364292631393608,"readability_flesch_mean":53.21399574175826,"url_rate":0.0,"vocab_size":6656,"word_mean":3.3819878510432257,"word_median":3.0}},{"alerts":[],"column":"year","extras":{"singletons":107,"top_values":[["",1300],["n.d.",574],["1992",338],["2007",282],["1971",271],["1979",254],["1980",225],["2005",221],["2015",217],["1986",208],["1997",208],["2006",204],["2009",202],["2011",196],["1975",195],["1963 [1854]",193],["1981",188],["2016",188],["2004",185],["2000",185]]},"kind":"categorical","n":11359,"n_null":0,"n_unique":271,"null_rate":0.0,"stats":{"cardinality":271,"entropy":6.100243326257095,"entropy_ratio":0.7547798605351158,"top_rate":0.1144466942512545,"top_value":""}},{"alerts":[{"code":"multilingual","level":"info","message":"26 languages detected in sample"},{"code":"duplicates","level":"warn","message":"50.1% duplicate strings"}],"column":"title","extras":{"language_counts":{"__engine":"fasttext:4,937","bs":1,"ca":2,"cs":1,"da":3,"de":224,"en":3620,"eo":2,"es":207,"fi":4,"fr":382,"hu":1,"id":19,"it":24,"ja":21,"ms":1,"nl":34,"no":4,"oc":1,"pl":8,"pt":41,"ru":22,"sk":2,"sl":6,"uk":7,"zh":300},"language_sample_size":5000,"length_histogram":{"counts":[1194,2283,3425,1892,1094,646,344,178,81,56,39,28,23,24,13,14,5,3,3,2,6,1,0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1],"edges":[0.0,39.05,78.1,117.14999999999999,156.2,195.25,234.29999999999998,273.34999999999997,312.4,351.45,390.5,429.54999999999995,468.59999999999997,507.65,546.6999999999999,585.75,624.8,663.8499999999999,702.9,741.9499999999999,781.0,820.05,859.0999999999999,898.15,937.1999999999999,976.2499999999999,1015.3,1054.35,1093.3999999999999,1132.4499999999998,1171.5,1210.55,1249.6,1288.6499999999999,1327.6999999999998,1366.75,1405.8,1444.85,1483.8999999999999,1522.9499999999998,1562.0]},"near_unique":false,"sample":["Old Frisian Etymological Dictionary","Comparative Linguistics in Southeast Asia. Pacific Linguistics C-142. Canberra: Australian National University.","http://language.psy.auckland.ac.nz/austronesian/about.php (accessed February 2008)","Manusela, Yazyk Tsentral'nogo Serama: Materialy i Zametki. Moscow: Econ-Inform.","Kusal. In Kropp Dakubu, M. E. (ed.), West African language data sheets, vol. 1. Legon, Ghana: West African Linguistic Society.","The relationship of Kam-Sui-Mak to Tai. Ph.D. dissertation, University of Michigan.","Preliminary notes on the Alor and Pantar languages (East Indonesia). Pacific Linguistics B-43. Canberra: Research School of Pacific, Studies Department of Linguistics, Australian National University.","The Upper Cross languages: a comparative study. Manuscript in preparation.","Stieng-English dictionary. Dallas: Summer Institute of Linguistics microfiche publications.","Kamus Basa Aceh. Kamus Bahasa aceh. Acehnese-Indonesian-English Thesaurus. Canberra: Pacific Linguistics.","Polyglotta africana, or a comparative vocabulary of nearly three hundred words and phrases in more than one hundred distinct African languages. London: Church Missionary House.","The Tanzanian Language Survey. http://www.cbold.ish-lyon.cnrs.fr/.","\u5e7f\u897f\u7389\u6797\u5e02\u5ba2\u5bb6\u65b9\u8a00\u8c03\u67e5\u7814\u7a76.\u5317\u4eac\uff1a\u4e2d\u56fd\u793e\u4f1a\u79d1\u5b66\u51fa\u7248\u793e","[Data supplied by Timothy Usher.]","Diccionario Huave de San Mateo del Mar. Serie de Vocabularios y Diccionarios Ind\u00edgenas Mariano Silva y Aceves, 24. M\u00e9xico: Instituto Ling\u00fc\u00edstico de Verano.","\u300a\u8fbe\u8ba9\u8bed\u7814\u7a76\u300b\u3002\u5317\u4eac\uff1a\u6c11\u65cf\u51fa\u7248\u793e","Lexique comparatif des dialectes Ba\u0272un et de Kasanga et Cobiana. Bremen.","[Bruce Biggs' 1972 collection of Polynesian word lists, provided by Andrew Pawley.]","Einf\u00fchrung in die eskimo-aleutischen Sprachen. Hamburg: Helmut Buske Verlag.","Etude comparative des langues du groupe Ring - langues Grassfields de l'ouest, Cameroun. Lyon: Universit\u00e9 Lumi\u00e8re - Lyon 2.","The languages of Ethiopia: A new lexicostatistic classification and some problems of diffusion. Anthropological Linguistics 13. 165-288.","The Binanderean languages of Papua New Guinea: reconstruction and subgrouping. Canberra: Pacific Linguistics.","Lengua Chorote. Buenos Aires: Universidad de Buenos Aires, Facultad de Filosof\u00eda y Letras, Instituto de Ling\u00fc\u00edstica.","A classified vocabulary of the Icibemba language. M\u00fcnchen: Lincom Europa.","New Hebrides Languages: An Internal Classification. Canberra: Research School of Pacific Studies, Australian National University.","\u00c9tude du Samba Leko, parler d'allani. M\u00fcnchen: Lincom Europa.","The eastern Bird's Head languages compared. In Reesink, Ger P. (ed.), Languages of the eastern Bird's Head, 1-44. Pacific Linguistics 524. Canberra: Australian National University.","Polyglotta africana, or a comparative vocabulary of nearly three hundred words and phrases in more than one hundred distinct African languages. London: Church Missionary House.","Projet ALGAB (Atlas Linguistique du Gabon). RefLex.","Jazguljamsko-Russkij Slovar'. Moscow: Akademia Nauk SSSR.","A Sociolinguistic Assessment of the Darwazi Speech Variety in Afghanistan. Linguistic Discovery 11(1).","Nzema-English, English-Nzema dictionary. Accra: Ghana Publishing Corporation.","http://www.africamuseum.be/research/human-sciences/linguistics/lexico-1?set_language=deandcl=de (accessed March 2008)","A Sociolinguistic Survey of Kinnauri spoken in Kinnaur District, Himachal Pradesh, India. Ms.","Languages of Irian Jaya, checklist: preliminary classification, language maps, wordlists. Pacific Linguistics: Series B 31. Canberra: Research School of Pacific and Asian Studies, Australian National University.","Ladakhi Grammar. Delhi: Motilal Banarsidass.","William J. Gedney's comparative Tai source book. Honolulu: University of Hawai'i Press.","\u8bed\u4fdd\u9879\u76ee\u6c49\u8bed\u65b9\u8a00\u5e7f\u897f\u8c61\u5dde\u5b98\u8bdd\u8bb0\u97f3","http://language.psy.auckland.ac.nz/austronesian/language.php?id=519 (accessed Jan 2010)","Historical reconstruction of the Maric languages of central Queensland. MA thesis, Australian National University, Canberra.","The Southern Bauchi group of Chadic languages: a survey report (Africana Marburgensia: Sonderheft 2). Marburg/Lahn: Africana Marburgensia.","Vergleichender Teil (Westafrikanische Studien 20). In Das Hone und seine Stellung im Zentral-Jukunoid (Westafrikanische Studien 20), 267-399. K\u00f6ln: R\u00fcdiger K\u00f6ppe.","\u4e8e\u90fd\u65b9\u8a00\u8bcd\u5178.\u5357\u4eac\uff1a\u6c5f\u82cf\u6559\u80b2\u51fa\u7248\u793e","Basic Materials in Wankumara (Galali): Grammar, Sentences and Vocabulary (Pacific Linguistics: Series B 65). Canberra: Research School of Pacific and Asian Studies, Australian National University.","\u6e56\u5357\u5357\u53bf\u8bdd\u8bed\u97f3\u7814\u7a76[D].\u6e56\u5357\u5e08\u8303\u5927\u5b66","https://github.com/SAuderset/MixteCoDB (accessed May 2024); Josserand, J. Kathryn. 1983. Mixtec Dialect History. New Orleans: Tulane University dissertation. (Doctoral dissertation); Amith, Jonathan D. and Castillo Garc\u00eda, Rey. 2021. Recursos lexicosem\u00e1nticos para el mixteco de Yolox\u00f3chitl, municipio de San Luis Acatl\u00e1n, Guerrero (Glottocode yolo1241; ISO 639-3 xty). (unpublished).","Kabui (Rongmei) grammar. Imphal: Directorate for the Development of Tribals and Backward Classes, Govt. of Manipur.","The consonants of proto-Upper Cross and their implications for the classification of the Upper Cross languages. Department of African Languages, State University of Leiden.","An etymological dictionary of the Chinantec languages: Studies in Chinantec languages 1. (Summer Institute of Linguistics and the University of Texas at Arlington Publications in Linguistics, 87.) Dallas: Summer Institute of Linguistics and the University of Texas at Arlington.","Sig\u00f8jnersprog i Danmark. Danske Studier 46, 97-145. Copenhagen: Gyldendalske Boghandel."],"top_values":[["http://www.africamuseum.be/research/human-sciences/linguistics/lexico-1?set_language=deandcl=de (accessed March 2008)",355],["Polyglotta africana, or a comparative vocabulary of nearly three hundred words and phrases in more than one hundred distinct African languages. London: Church Missionary House.",193],["[http://language.psy.auckland.ac.nz/austronesian/ (accessed September 2007).]",173],["New Hebrides Languages: An Internal Classification. Canberra: Research School of Pacific Studies, Australian National University.",132],["The Tanzanian Language Survey. http://www.cbold.ish-lyon.cnrs.fr/.",119],["http://www.africamuseum.be/research/human-sciences/linguistics/lexico-1?set_language=de&cl=de (accessed March 2008)",107],["The languages of Ethiopia: A new lexicostatistic classification and some problems of diffusion. Anthropological Linguistics 13. 165-288.",96],["[Data supplied by Timothy Usher.]",87],["Classification interne du groupe banto\u00efde I. M\u00fcnchen: Lincom Europa.",69],["\u666e\u901a\u8bdd\u57fa\u7840\u65b9\u8a00\u57fa\u672c\u8bcd\u6c47\u96c6\u00b7\u8bcd\u6c47\u5377.\u5317\u4eac\uff1a\u8bed\u6587\u51fa\u7248\u793e",66],["Projet ALGAB (Atlas Linguistique du Gabon). RefLex.",60],["Austronesian languages of the Morobe district, Papua. Oceanic Linguistics 10(2). 79-151. Honolulu: The University Press of Hawaii.",54],["Comparative Vocabulary: Selected Words in Indigenous Languages of Colombia. Bogot\u00e1, Colombia: Summer Institute of Linguistics.",47],["Sociolinguistic survey of Northern Pakistan 2. Languages of northern areas. Islamabad: National Institute of Pakistan Studies and Summer Institute of Linguistics.",47],["Las areas dialectales del Nahuatl moderno. Mexico: Universidad Nacional Aut\u00f3noma de Mexico.",47],["http://language.psy.auckland.ac.nz/austronesian/about.php (accessed February 2008)",45],["Philippine Minor Languages: Word Lists and Phonologies. Oceanic Linguistics Special Publication No. 8. Honolulu: University of Hawai'i Press.",41],["Moru-Ma\u2019di survey report. Nairobi: Sudan Branch, Summer Institute of Linguistics (SIL).",40],["The Trans-New Guinea Phylum: Explorations in deep-level genetic relationships. Canberra: Pacific Linguistics.",38],["https://github.com/SAuderset/MixteCoDB (accessed May 2024); Josserand, J. Kathryn. 1983. Mixtec Dialect History. New Orleans: Tulane University dissertation. (Doctoral dissertation)",38]],"top_words":[["of",8251],["the",4405],["and",3383],["in",2675],["a",2236],["de",2116],["languages",1752],["language",1364],["linguistics",1305],["(accessed",1300],["university",1223],["new",1208],["pacific",1180],["university.",1040],["comparative",1037],["national",1027],["canberra:",1000],["australian",990],["institute",917],["survey",839],["african",711],["research",627],["studies,",620],["sil",568],["school",559]],"vocab_skipped":null,"word_histogram":{"counts":[3577,3667,1989,1275,436,167,90,48,28,24,22,15,5,6,0,5,1,1,1,0,0,0,0,0,0,1,0,0,0,1],"edges":[1.0,8.3,15.6,22.9,30.2,37.5,44.8,52.1,59.4,66.7,74.0,81.3,88.6,95.89999999999999,103.2,110.5,117.8,125.1,132.4,139.7,147.0,154.29999999999998,161.6,168.9,176.2,183.5,190.79999999999998,198.1,205.4,212.7,220.0]}},"kind":"text","n":11359,"n_null":0,"n_unique":5663,"null_rate":0.0,"stats":{"allcaps_rate":0.061537107139713006,"boilerplate_rate":0.0,"duplicate_rate":0.5014525926578044,"emoji_rate":0.0,"len_max":1562,"len_mean":120.61598732282772,"len_median":104.0,"len_min":0,"len_p95":261.0,"n_duplicates":5696,"n_empty":7,"one_word_rate":0.07007659124922969,"readability_flesch_mean":8.00288358102524,"url_rate":0.17413504709921648,"vocab_size":21846,"word_mean":14.658684743375296,"word_median":12.0}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 100.0% of rows"}],"column":"journal","extras":{"singletons":2,"top_values":[["",11355],["M\u00fcnchener Studien zur Sprachwissenschaft",2],["Zeitschrift f\u00fcr vergleichende Sprachforschung",1],["Historische Sprachforschung",1]]},"kind":"categorical","n":11359,"n_null":0,"n_unique":4,"null_rate":0.0,"stats":{"cardinality":4,"entropy":0.005075795124253641,"entropy_ratio":0.0025378975621268207,"top_rate":0.9996478563253808,"top_value":""}},{"alerts":[{"code":"long_tail","level":"info","message":"13 singleton categories"},{"code":"imbalance","level":"warn","message":"top value is 99.9% of rows"}],"column":"publisher","extras":{"singletons":13,"top_values":[["",11344],["Brill",2],["Winter",1],["Reichert",1],["Vosto\u010dnaja Literatura",1],["Rodopi",1],["Harrassowitz",1],["K. J. Tr\u00fcbner",1],["Belaruska\u00e2 navuka",1],["Fitzroy Dearborn Publishers",1],["Karl J. Tr\u00fcbner",1],["Institut f\u00fcr Sprachwissenschaft der Universit\u00e4t Innsbruck",1],["Vandenhoeck & Ruprecht",1],["Walter de Gruyter & Co.",1],["Nova Fronteira",1]]},"kind":"categorical","n":11359,"n_null":0,"n_unique":15,"null_rate":0.0,"stats":{"cardinality":15,"entropy":0.019517506292282574,"entropy_ratio":0.004995662359785793,"top_rate":0.9986794612201778,"top_value":""}},{"alerts":[{"code":"long_tail","level":"info","message":"3 singleton categories"},{"code":"imbalance","level":"warn","message":"top value is 100.0% of rows"}],"column":"editor","extras":{"singletons":3,"top_values":[["",11356],["Tischler, Johann",1],["Martyna\u01d4, V. and G., Cyhun",1],["Mallory, James P.",1]]},"kind":"categorical","n":11359,"n_null":0,"n_unique":4,"null_rate":0.0,"stats":{"cardinality":4,"entropy":0.0039389169966472905,"entropy_ratio":0.0019694584983236453,"top_rate":0.9997358922440356,"top_value":""}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 100.0% of rows"}],"column":"url","extras":{"singletons":0,"top_values":[["",11359]]},"kind":"categorical","n":11359,"n_null":0,"n_unique":1,"null_rate":0.0,"stats":{"cardinality":1,"entropy":-0.0,"entropy_ratio":0.0,"top_rate":1.0,"top_value":""}},{"alerts":[{"code":"multilingual","level":"info","message":"31 languages detected in sample"},{"code":"duplicates","level":"warn","message":"54.4% duplicate strings"}],"column":"citation","extras":{"language_counts":{"__engine":"fasttext:5,000","bs":1,"ca":2,"cs":3,"de":249,"en":3590,"eo":22,"es":205,"et":1,"fi":3,"fr":401,"hr":1,"hu":6,"id":11,"it":24,"ja":15,"ms":9,"nl":34,"no":3,"os":1,"pam":1,"pl":4,"pt":45,"ru":19,"sk":3,"sl":5,"sq":1,"sr":1,"uk":4,"war":1,"zh":335},"language_sample_size":5000,"length_histogram":{"counts":[7,22,20,8,54,128,99,99,133,125,61,269,86,119,95,71,105,312,5296,3139,854,168,64,4,6,3,6,0,2,0,0,1,1,0,0,1,0,0,0,1],"edges":[8.0,11.35,14.7,18.05,21.4,24.75,28.1,31.45,34.8,38.150000000000006,41.5,44.85,48.2,51.550000000000004,54.9,58.25,61.6,64.95,68.30000000000001,71.65,75.0,78.35000000000001,81.7,85.05,88.4,91.75,95.10000000000001,98.45,101.8,105.15,108.5,111.85000000000001,115.2,118.55,121.9,125.25,128.60000000000002,131.95,135.3,138.65,142.0]},"near_unique":false,"sample":["Boutkan 2005. Old Frisian Etymological Dictionary","Peiros 1998. Comparative Linguistics in Southeast Asia. Pacific Lingui\u2026","Anon. n.d.. http://language.psy.auckland.ac.nz/austronesian/about.php\u2026","Chlenova 2012. Manusela, Yazyk Tsentral'nogo Serama: Materialy i Zametki\u2026","Spratt 1977. Kusal. In Kropp Dakubu, M. E. (ed.), West African languag\u2026","Oshika 1973. The relationship of Kam-Sui-Mak to Tai. Ph.D. dissertatio\u2026","Stokhof 1975. Preliminary notes on the Alor and Pantar languages (East \u2026","Blench 2014. The Upper Cross languages: a comparative study. Manuscrip\u2026","Haupers 1991. Stieng-English dictionary. Dallas: Summer Institute of Li\u2026","Daud 1999. Kamus Basa Aceh. Kamus Bahasa aceh. Acehnese-Indonesian-E\u2026","Koelle 1963 [1854]. Polyglotta africana, or a comparative vocabulary of nearl\u2026","Nurse 1975/99. The Tanzanian Language Survey. http://www.cbold.ish-lyon.\u2026","\u9648\u6653\u9526 2004. \u5e7f\u897f\u7389\u6797\u5e02\u5ba2\u5bb6\u65b9\u8a00\u8c03\u67e5\u7814\u7a76.\u5317\u4eac\uff1a\u4e2d\u56fd\u793e\u4f1a\u79d1\u5b66\u51fa\u7248\u793e","Project n.d.. [Data supplied by Timothy Usher.]","Stairs Kreger 1981. Diccionario Huave de San Mateo del Mar. Serie de Vocabula\u2026","\u6c5f\u837b\u3001\u674e\u5927\u52e4\u3001\u5b59\u5b8f\u5f00 2013. \u300a\u8fbe\u8ba9\u8bed\u7814\u7a76\u300b\u3002\u5317\u4eac\uff1a\u6c11\u65cf\u51fa\u7248\u793e","B\u00fchnen 1988. Lexique comparatif des dialectes Ba\u0272un et de Kasanga et C\u2026","Biggs 1972. [Bruce Biggs' 1972 collection of Polynesian word lists, p\u2026","Holst 2005. Einf\u00fchrung in die eskimo-aleutischen Sprachen. Hamburg: H\u2026","Paulin 1995. Etude comparative des langues du groupe Ring - langues Gr\u2026","Bender 1971. The languages of Ethiopia: A new lexicostatistic classifi\u2026","Smallhorn 2011. The Binanderean languages of Papua New Guinea: reconstruc\u2026","Gerzenstein 1978. Lengua Chorote. Buenos Aires: Universidad de Buenos Aires\u2026","Kasond 2002. A classified vocabulary of the Icibemba language. M\u00fcnchen\u2026","Tryon 1979. New Hebrides Languages: An Internal Classification. Canbe\u2026","Fabre 2003. \u00c9tude du Samba Leko, parler d'allani. M\u00fcnchen: Lincom Eur\u2026","Reesink 2002. The eastern Bird's Head languages compared. In Reesink, G\u2026","Koelle 1963 [1854]. Polyglotta africana, or a comparative vocabulary of nearl\u2026","Van der Veen 2011. Projet ALGAB (Atlas Linguistique du Gabon). RefLex","Edel'man 1971. Jazguljamsko-Russkij Slovar'. Moscow: Akademia Nauk SSSR","Beck 2013. A Sociolinguistic Assessment of the Darwazi Speech Variet\u2026","Aboagye 1968. Nzema-English, English-Nzema dictionary. Accra: Ghana Pub\u2026","Anon. n.d.. http://www.africamuseum.be/research/human-sciences/lingui\u2026","Chamberlain 1998. A Sociolinguistic Survey of Kinnauri spoken in Kinnaur Di\u2026","Voorhoeve 1975. Languages of Irian Jaya, checklist: preliminary classific\u2026","Koshal 1979. Ladakhi Grammar. Delhi: Motilal Banarsidass","Hudak 2008. William J. Gedney's comparative Tai source book. Honolulu\u2026","\u767d\u4e91 2018. \u8bed\u4fdd\u9879\u76ee\u6c49\u8bed\u65b9\u8a00\u5e7f\u897f\u8c61\u5dde\u5b98\u8bdd\u8bb0\u97f3","Anon. n.d.. http://language.psy.auckland.ac.nz/austronesian/language.\u2026","Barrett 2005. Historical reconstruction of the Maric languages of centr\u2026","Shimizu 1978. The Southern Bauchi group of Chadic languages: a survey r\u2026","Storch 1999. Vergleichender Teil (Westafrikanische Studien 20). In Das\u2026","\u8c22\u7559\u6587 1998. \u4e8e\u90fd\u65b9\u8a00\u8bcd\u5178.\u5357\u4eac\uff1a\u6c5f\u82cf\u6559\u80b2\u51fa\u7248\u793e","McDonald 1979. Basic Materials in Wankumara (Galali): Grammar, Sentences\u2026","\u90b9\u598d 2007. \u6e56\u5357\u5357\u53bf\u8bdd\u8bed\u97f3\u7814\u7a76[D].\u6e56\u5357\u5e08\u8303\u5927\u5b66","Auderset 2024. https://github.com/SAuderset/MixteCoDB (accessed May 2024\u2026","Ibopishak Singh 1990. Kabui (Rongmei) grammar. Imphal: Directorate for the Deve\u2026","Dimmendaal 1978. The consonants of proto-Upper Cross and their implication\u2026","Rensch 1989. An etymological dictionary of the Chinantec languages: St\u2026","Miskow 1923. Sig\u00f8jnersprog i Danmark. Danske Studier 46, 97-145. Copen\u2026"],"top_values":[["Anon. n.d.. http://www.africamuseum.be/research/human-sciences/lingui\u2026",462],["Anon. n.d.. http://language.psy.auckland.ac.nz/austronesian/language.\u2026",221],["Koelle 1963 [1854]. Polyglotta africana, or a comparative vocabulary of nearl\u2026",193],["Anon. n.d.. [http://language.psy.auckland.ac.nz/austronesian/ (access\u2026",180],["Tryon 1979. New Hebrides Languages: An Internal Classification. Canbe\u2026",143],["Nurse 1975/99. The Tanzanian Language Survey. http://www.cbold.ish-lyon.\u2026",119],["Bender 1971. The languages of Ethiopia: A new lexicostatistic classifi\u2026",98],["Project n.d.. [Data supplied by Timothy Usher.]",87],["Anon. n.d.. http://language.psy.auckland.ac.nz/austronesian/about.php\u2026",76],["Auderset 2024. https://github.com/SAuderset/MixteCoDB (accessed May 2024\u2026",68],["Piron 1997. Classification interne du groupe banto\u00efde I. M\u00fcnchen: Lin\u2026",67],["\u9648\u7ae0\u592a\u3001\u674e\u884c\u5065 1996. \u666e\u901a\u8bdd\u57fa\u7840\u65b9\u8a00\u57fa\u672c\u8bcd\u6c47\u96c6\u00b7\u8bcd\u6c47\u5377.\u5317\u4eac\uff1a\u8bed\u6587\u51fa\u7248\u793e",66],["Van der Veen 2011. Projet ALGAB (Atlas Linguistique du Gabon). RefLex",60],["Hooley 1971. Austronesian languages of the Morobe district, Papua. Oce\u2026",54],["Huber 1992. Comparative Vocabulary: Selected Words in Indigenous Lang\u2026",48],["Backstrom 1992. Sociolinguistic survey of Northern Pakistan 2. Languages \u2026",47],["Lastra de Suarez 1986. Las areas dialectales del Nahuatl moderno. Mexico: Univer\u2026",47],["Reid 1971. Philippine Minor Languages: Word Lists and Phonologies. O\u2026",41],["Boone 1992. Moru-Ma\u2019di survey report. Nairobi: Sudan Branch, Summer I\u2026",40],["Peiros 1998. Comparative Linguistics in Southeast Asia. Pacific Lingui\u2026",40]],"top_words":[["of",4183],["the",3104],["a",1953],["n.d..",1867],["anon.",1275],["in",1270],["and",1175],["languages",1035],["\u2026",989],["comparative",904],["de",842],["language",788],["new",582],["survey",559],["http://www.africamuseum.be/research/human-sciences/lingui\u2026",462],["on",412],["sociolinguistic",393],["by",381],["grammar",373],["dictionary.",373],["du",364],["[data",341],["supplied",339],["languages:",338],["1992.",338]],"vocab_skipped":null,"word_histogram":{"counts":[7,1635,0,283,96,0,234,625,0,620,0,1653,2376,0,2129,1291,0,316,82,0,10,0,1,0,0,0,0,0,0,1],"edges":[2.0,2.6333333333333333,3.2666666666666666,3.9,4.533333333333333,5.166666666666666,5.8,6.433333333333334,7.066666666666666,7.699999999999999,8.333333333333332,8.966666666666667,9.6,10.233333333333333,10.866666666666667,11.5,12.133333333333333,12.766666666666666,13.399999999999999,14.033333333333333,14.666666666666666,15.299999999999999,15.933333333333334,16.566666666666666,17.2,17.833333333333332,18.466666666666665,19.099999999999998,19.733333333333334,20.366666666666667,21.0]}},"kind":"text","n":11359,"n_null":0,"n_unique":5178,"null_rate":0.0,"stats":{"allcaps_rate":0.06004049652258121,"boilerplate_rate":0.0,"duplicate_rate":0.5441500132053878,"emoji_rate":0.0,"len_max":142,"len_mean":67.93811074918567,"len_median":71.0,"len_min":8,"len_p95":77.0,"n_duplicates":6181,"n_empty":0,"one_word_rate":0.0,"readability_flesch_mean":29.26145445804198,"url_rate":0.1333744167620389,"vocab_size":15535,"word_mean":8.826569240250022,"word_median":10.0}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","columns.author.stats.duplicate_rate","columns.author.stats.n_empty","columns.author.language_counts","columns.author.top_values","columns.citation.stats.duplicate_rate","columns.citation.top_values","columns.title.stats.duplicate_rate","columns.title.top_values","columns.title.language_counts","columns.year.n_unique","columns.year.stats.entropy_ratio","columns.year.top_values","columns.editor.stats.top_rate","columns.publisher.stats.top_rate","columns.url.stats.top_rate"],"featured_charts":[{"caption":"Distribution of publication years; watch for the large empty/'n.d.' share and peaks around 1971, 1979, and 1992.","column":"year","kind":"bar"},{"caption":"Top authors are dominated by a few prolific contributors (Koelle, Tryon, Blench), reflecting the 66% duplicate rate.","column":"author","kind":"bar"},{"caption":"Citation string lengths cluster around 70 characters \u2014 useful for spotting truncated or anomalously short entries.","column":"citation","kind":"length"},{"caption":"Title lengths are highly skewed (median 104, max 1562); long outliers often contain embedded URLs.","column":"title","kind":"length"},{"caption":"Language mix of titles shows English dominant but with sizeable German, French, Spanish, and Chinese minorities.","column":"title","kind":"donut"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset is a bibliographic reference list with 11,359 rows and 9 columns (key, author, citation, title, year, plus mostly-empty editor/publisher/journal/url fields). The most informative columns are author, citation, title, and year \u2014 the rest are either unique IDs or near-empty categoricals. Note that author has a 66% duplicate rate and 1,277 empty values, while citation and title both show heavy duplication (54% and 50%) driven by a handful of large source collections like Koelle's Polyglotta africana and the Africa Museum and Austronesian web archives. The year column spans 271 distinct values with reasonable spread (entropy ratio 0.75), though about 11% of rows have no year and another 574 are marked 'n.d.'. Author and title are also multilingual, with English dominant but meaningful German, French, Spanish, and Chinese subsets.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.one_word_rate","stats.duplicate_rate","stats.len_mean","stats.len_max","stats.allcaps_rate","stats.vocab_size","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column is almost certainly a primary identifier: every one of the 11,359 rows holds a unique, single-token value (n_unique=11359, one_word_rate=1.0, duplicate_rate=0.0). Values are short (len_mean 4.07, len_max 5) and 99.1% are uppercase, consistent with short alphanumeric codes rather than natural text. The top_words sample shows purely numeric tokens (47, 49, 50, ...), so the 'allcaps' signal may be a side effect of digit-only strings rather than true letters.","role":"identifier","scope":"column","target":"key","treatment":"Use as a row key or left-join key; drop from modelling features."},{"confidence":"high","critiques":[],"evidence_keys":["alerts","n","n_unique","null_rate","stats.duplicate_rate","stats.n_duplicates","stats.n_empty","stats.word_mean","top_values","top_words","language_counts"],"model":"anthropic:claude-opus-4-7","narrative":"This is an author/contributor name field, mostly formatted 'Surname, Given Name' with frequent multi-author strings joined by 'and' (4049 occurrences). Duplication is severe: 66.3% of rows repeat, with prolific contributors like 'Koelle, Sigismund Wilhelm' (193) and 'Tryon, Darrell T.' (191) dominating, and 1277 rows are empty strings despite a 0.0 null rate. Names span 30 detected languages \u2014 predominantly English (3125) and German (384), but also 113 rows of CJK script \u2014 so naive string matching will fragment identities.","role":"metadata","scope":"column","target":"author","treatment":"Normalize to a canonical name form and split multi-author strings on ' and ' before any join or grouping."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This is a year field stored as strings rather than integers, with 271 distinct values across 11,359 rows. The most common entry is an empty string (1,300 rows, 11.4%) followed by the literal \"n.d.\" (574 rows), so roughly 16.5% of records carry no usable year despite a 0.0 null rate. Actual years span at least 1971 to 2015 in the top values, with 1992 the most frequent real year at 338 occurrences.","role":"feature","scope":"column","target":"year","treatment":"Coerce to integer, mapping empty strings and \"n.d.\" to missing before any temporal analysis."},{"confidence":"high","critiques":[],"evidence_keys":["language_counts","n","n_unique","stats.duplicate_rate","stats.n_duplicates","stats.url_rate","stats.len_median","stats.len_max","top_values","top_words","alerts"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds bibliographic citation strings \u2014 book/article titles, URLs, and access notes for linguistic sources, dominated by English (3620) but mixing 24 other languages including French (382), Chinese (300), German (224), and Spanish (207). Half the values are duplicates (duplicate_rate 0.50, 5696 repeats across 11359 rows), with a single africamuseum.be URL appearing 355 times and 17.4% of entries containing URLs. Length varies wildly (median 104 chars, max 1562) and top words ('of','the','languages','linguistics','(accessed') confirm these are reference citations rather than free-form titles.","role":"metadata","scope":"column","target":"title","treatment":"Normalize and deduplicate citations into a source-reference lookup table rather than treating as a modelling feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This appears to be a journal-name field for bibliographic records, but it is effectively empty: 11,355 of 11,359 rows (top_rate 0.9996) hold an empty string, leaving only 4 actual journal names across 3 distinct German linguistics titles. Entropy is 0.005 (entropy_ratio 0.0025), so the column carries virtually no information despite a 0.0 null rate \u2014 the blanks are stored as empty strings rather than nulls.","role":"metadata","scope":"column","target":"journal","treatment":"Drop; near-constant empty string with only 4 populated rows."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values","alerts"],"model":"anthropic:claude-opus-4-7","narrative":"Publisher name field, but it is effectively empty: 11,344 of 11,359 rows (top_rate 0.9987) carry an empty string, leaving only 15 distinct values and an entropy_ratio of 0.005. The handful of populated entries (Brill with 2, then Winter, Reichert, Rodopi, Harrassowitz and others with 1 each) hint at academic/humanities publishers but are too sparse to be useful. Note that null_rate is 0.0 because the blanks are stored as empty strings rather than true nulls.","role":"metadata","scope":"column","target":"publisher","treatment":"Drop; the column is ~99.9% empty strings and carries almost no signal."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy","stats.cardinality","top_values","alerts"],"model":"anthropic:claude-opus-4-7","narrative":"This appears to be an editor name field for bibliographic records, but it is effectively empty: 11356 of 11359 rows (top_rate 0.9997) hold the empty string, with only three distinct named editors each appearing once. Entropy is essentially zero (0.0039) and cardinality is just 4, so the column carries almost no information despite a 0.0 null rate (blanks are encoded as ''). The long_tail and imbalance alerts simply reflect that three singleton names sit beside one dominant blank.","role":"metadata","scope":"column","target":"editor","treatment":"Drop; near-constant blank with only three populated rows."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column is labelled 'url' but contains a single value\u2014an empty string\u2014across all 11,359 rows. Cardinality is 1, entropy is 0, and the top_rate is 1.0, so it carries no information whatsoever. Likely a placeholder field that was never populated during ingestion.","role":"metadata","scope":"column","target":"url","treatment":"Drop; the column is constant and has zero predictive value."},{"confidence":"high","critiques":[],"evidence_keys":["language_counts","stats.duplicate_rate","stats.n_duplicates","n_unique","top_values","stats.url_rate","stats.readability_flesch_mean","stats.len_mean"],"model":"anthropic:claude-opus-4-7","narrative":"Bibliographic citation strings for linguistic sources, mostly English (3590) with substantial French (401), Chinese (335), and German (249) entries. Heavy duplication is the headline: 6181 duplicates (54.4%) across only 5178 unique values, with the top value 'Anon. n.d.. http://www.africamuseum.be/...' repeating 462 times. URLs appear in 13.3% of rows and Flesch readability is low (29.3), consistent with reference-style text rather than prose.","role":"metadata","scope":"column","target":"citation","treatment":"Normalize and deduplicate to a citation lookup table, then reference by key."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":3531,"prompt_tokens":16138,"total_tokens":19669}},"language_counts":{"bs":2,"ca":8,"ceb":3,"cs":8,"da":9,"de":857,"en":10335,"eo":30,"es":527,"et":2,"eu":2,"fi":17,"fr":919,"hr":3,"hu":11,"id":33,"it":139,"ja":82,"lt":1,"ms":12,"nl":175,"no":12,"oc":1,"os":1,"pam":1,"pl":28,"pt":115,"ru":56,"sk":5,"sl":11,"sq":2,"sr":2,"sv":8,"tl":1,"uk":11,"vi":2,"war":1,"zh":647},"meta":{"generated_at":"2026-05-01T18:36:26+00:00","mode":"full","row_count":11359,"sampled_rows":11359,"seed":42,"source":"/home/coolhand/servers/diachronica/etymology_atlas/processed/lexibank_references.json"},"notes":[],"saturn_version":"0.2.0","schema":{"author":"text","citation":"text","editor":"categorical","journal":"categorical","key":"text","publisher":"categorical","title":"text","url":"categorical","year":"categorical"}}
