{"columns":[{"alerts":[{"code":"one_word","level":"warn","message":"94.6% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"24.9% duplicate strings"}],"column":"form","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[530,9411,6280,6745,1056,823,222,284,100,128,65,14,27,5,7,10,5,2,3,0,4,6,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2],"edges":[1.0,2.55,4.1,5.65,7.2,8.75,10.3,11.85,13.4,14.950000000000001,16.5,18.05,19.6,21.150000000000002,22.7,24.25,25.8,27.35,28.900000000000002,30.45,32.0,33.550000000000004,35.1,36.65,38.2,39.75,41.300000000000004,42.85,44.4,45.95,47.5,49.050000000000004,50.6,52.15,53.7,55.25,56.800000000000004,58.35,59.9,61.45,63.0]},"near_unique":false,"sample":["luftoj","s\u00f6","spaga","era","\u010dan\u010d","m\u0159\u2032e-","enc","berg","qa\u1e37b","aigua","herthye","morir","mala\u00f1","pref\u00ebt","Hond","isci","kohar\u0101","pe\u1e63ik","m\u00fc\u1e63\u00e2p \u010d-","bhasma(n)","ewr","mlha","fruta","hr\u00e6\u00f0ast","g\u2032\u00f2l","ko\u010dak","so\u00f3r","penzare","nez","gi\u00e0lt","puchn\u0105\u0107","ffon","eira","t\u00edni","\u1e2ba\u1e2b\u1e2bal(u)u\u032fant(i)-","thr\u00edx","graban","bon","p\u01ebp","xi\u0161\u00e1p","\u1e0dang","drega","u\u1e63\u1e47\u00e1-","swart","t-\u00fczik ti-","m\u00e2r\u2032a\u0148i\u014b","n\u0101h","depro-","jezioro","hasv"],"top_values":[["noga",24],["nos",20],["sol",18],["p\u0101",16],["tri",16],["oko",15],["dva",15],["d\u016br",15],["bitter",15],["voda",14],["par",14],["sar",14],["dom",14],["n\u0101m",13],["rot",13],["un",13],["salt",13],["horn",13],["tre",12],["gras",12]],"top_words":[["/",70],["a",36],["\u2260",34],["se",29],["i",20],["yax",19],["noga",18],["nos",17],["gras",15],["bitter",15],["sol",13],["par",13],["du",12],["dom",12],["horn",12],["tri",12],["oko",11],["dva",11],["un",11],["p\u0101",11],["sar",11],["luna",11],["an",11],["salt",11],["n\u0101m",11]],"vocab_skipped":null,"word_histogram":{"counts":[24351,0,0,1117,0,0,210,0,0,36,0,0,11,0,0,2,0,0,1,0,0,1,0,0,0,0,0,1,0,1],"edges":[1.0,1.3333333333333333,1.6666666666666665,2.0,2.333333333333333,2.6666666666666665,3.0,3.333333333333333,3.6666666666666665,4.0,4.333333333333333,4.666666666666666,5.0,5.333333333333333,5.666666666666666,6.0,6.333333333333333,6.666666666666666,7.0,7.333333333333333,7.666666666666666,8.0,8.333333333333332,8.666666666666666,9.0,9.333333333333332,9.666666666666666,10.0,10.333333333333332,10.666666666666666,11.0]}},"kind":"text","n":25731,"n_null":0,"n_unique":19334,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.24861062531576697,"emoji_rate":0.0,"len_max":63,"len_mean":5.373285142435195,"len_median":5.0,"len_min":1,"len_p95":9.0,"n_duplicates":6397,"n_empty":0,"one_word_rate":0.9463681940072286,"readability_flesch_mean":86.61917500000003,"url_rate":0.0,"vocab_size":16219,"word_mean":1.0672729392561502,"word_median":1.0}},{"alerts":[],"column":"language_id","extras":{"histogram":{"counts":[683,684,1190,344,1036,862,691,1033,686,603,155,513,345,632,686,833,375,415,511,175,174,550,626,340,173,673,655,510,346,674,621,500,517,513,839,1253,1345,1115,966,889],"edges":[3.0,10.85,18.7,26.549999999999997,34.4,42.25,50.099999999999994,57.949999999999996,65.8,73.64999999999999,81.5,89.35,97.19999999999999,105.05,112.89999999999999,120.75,128.6,136.45,144.29999999999998,152.15,160.0,167.85,175.7,183.54999999999998,191.39999999999998,199.25,207.1,214.95,222.79999999999998,230.64999999999998,238.5,246.35,254.2,262.05,269.9,277.75,285.59999999999997,293.45,301.3,309.15,317.0]},"sample":[7.0,7.0,7.0,7.0,8.0,10.0,10.0,10.0,15.0,15.0,15.0,15.0,15.0,15.0,16.0,17.0,17.0,18.0,18.0,20.0,20.0,20.0,20.0,23.0,24.0,24.0,25.0,25.0,26.0,27.0,27.0,27.0,27.0,32.0,32.0,32.0,32.0,35.0,35.0,35.0,35.0,35.0,35.0,36.0,36.0,36.0,36.0,39.0,39.0,39.0,40.0,40.0,40.0,42.0,42.0,42.0,46.0,46.0,46.0,46.0,47.0,47.0,47.0,47.0,48.0,48.0,48.0,92.0,50.0,50.0,50.0,50.0,50.0,50.0,51.0,51.0,54.0,54.0,54.0,54.0,55.0,55.0,56.0,56.0,56.0,58.0,58.0,58.0,58.0,58.0,59.0,60.0,60.0,60.0,62.0,62.0,62.0,62.0,63.0,95.0,95.0,65.0,66.0,66.0,66.0,66.0,66.0,97.0,69.0,75.0,75.0,75.0,76.0,76.0,110.0,110.0,110.0,110.0,110.0,111.0,80.0,80.0,112.0,44.0,100.0,100.0,100.0,100.0,108.0,105.0,105.0,105.0,105.0,81.0,81.0,82.0,82.0,82.0,82.0,42.0,115.0,115.0,120.0,120.0,120.0,120.0,119.0,119.0,119.0,119.0,119.0,119.0,46.0,118.0,38.0,24.0,3.0,65.0,25.0,122.0,122.0,122.0,122.0,122.0,50.0,8.0,95.0,124.0,124.0,124.0,125.0,124.0,124.0,128.0,127.0,127.0,129.0,27.0,3.0,110.0,143.0,143.0,143.0,143.0,143.0,144.0,144.0,145.0,145.0,8.0,147.0,157.0,58.0,162.0,162.0,162.0,162.0,66.0,172.0,172.0,97.0,173.0,15.0,173.0,174.0,177.0,174.0,174.0,177.0,177.0,177.0,177.0,177.0,177.0,177.0,177.0,124.0,124.0,148.0,112.0,176.0,176.0,176.0,176.0,176.0,176.0,175.0,175.0,175.0,48.0,97.0,66.0,16.0,16.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,118.0,118.0,189.0,189.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,195.0,195.0,195.0,201.0,201.0,201.0,201.0,201.0,203.0,203.0,203.0,204.0,204.0,204.0,204.0,207.0,209.0,209.0,210.0,210.0,210.0,210.0,210.0,210.0,211.0,211.0,211.0,211.0,219.0,219.0,219.0,219.0,219.0,7.0,143.0,143.0,220.0,122.0,122.0,44.0,222.0,157.0,120.0,39.0,229.0,229.0,38.0,230.0,230.0,230.0,230.0,231.0,231.0,231.0,234.0,42.0,147.0,10.0,147.0,56.0,147.0,236.0,236.0,236.0,51.0,111.0,111.0,23.0,50.0,144.0,238.0,238.0,238.0,238.0,239.0,239.0,239.0,239.0,239.0,239.0,242.0,242.0,242.0,242.0,243.0,243.0,243.0,243.0,243.0,243.0,249.0,249.0,65.0,254.0,254.0,254.0,254.0,254.0,253.0,259.0,259.0,260.0,260.0,260.0,260.0,260.0,257.0,257.0,75.0,265.0,265.0,266.0,266.0,266.0,271.0,271.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,273.0,273.0,273.0,273.0,273.0,273.0,277.0,277.0,278.0,278.0,274.0,274.0,274.0,274.0,274.0,274.0,274.0,263.0,263.0,263.0,263.0,263.0,242.0,279.0,279.0,279.0,279.0,279.0,280.0,280.0,280.0,281.0,282.0,282.0,282.0,282.0,282.0,282.0,283.0,283.0,283.0,284.0,284.0,286.0,286.0,286.0,286.0,286.0,285.0,285.0,285.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,289.0,289.0,289.0,291.0,292.0,293.0,293.0,118.0,271.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,295.0,295.0,297.0,297.0,298.0,298.0,299.0,299.0,300.0,301.0,301.0,301.0,301.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,303.0,304.0,305.0,305.0,305.0,305.0,305.0,305.0,307.0,307.0,307.0,308.0,308.0,308.0,311.0,311.0,311.0,311.0,311.0,311.0,314.0,314.0,314.0,315.0,315.0,316.0,316.0,316.0,316.0,303.0]},"kind":"numeric","n":25731,"n_null":0,"n_unique":160,"null_rate":0.0,"stats":{"iqr":201.0,"kurtosis":-1.471393905563583,"max":317.0,"mean":166.0061404531499,"median":174.0,"min":3.0,"n_outliers":0,"outlier_rate":0.0,"q1":65.0,"q3":266.0,"skew":-0.04884719953202867,"std":101.3938534933283,"zero_rate":0.0}},{"alerts":[],"column":"language_name","extras":{"singletons":0,"top_values":[["Bakhtiari",178],["Nepali",177],["Greek: Italiot",177],["Old Spanish",177],["Greek: Pontic",177],["Breton: Treger",177],["Hindi",176],["Romanian",176],["Greek: Cappadocian",176],["Breton: Gwened",176],["Middle Welsh",176],["Ladin",175],["Old Church Slavonic",175],["Elfdalian",175],["Old Swedish",175],["Welsh: North",175],["Old Polish",175],["Lithuanian",174],["Sinhalese",174],["Urdu",174]]},"kind":"categorical","n":25731,"n_null":0,"n_unique":160,"null_rate":0.0,"stats":{"cardinality":160,"entropy":7.293925249674526,"entropy_ratio":0.9961754820793187,"top_rate":0.006917725700516886,"top_value":"Bakhtiari"}},{"alerts":[],"column":"glottocode","extras":{"singletons":0,"top_values":[["mace1250",497],["swed1254",347],["czec1258",346],["poli1260",345],["sout2640",345],["slov1268",342],["oldc1252",317],["bakh1245",178],["east1436",177],["apul1236",177],["olds1249",177],["pont1253",177],["treg1244",177],["hind1269",176],["roma1327",176],["capp1239",176],["vann1244",176],["midd1363",176],["ladi1250",175],["chur1257",175]]},"kind":"categorical","n":25731,"n_null":0,"n_unique":152,"null_rate":0.0,"stats":{"cardinality":152,"entropy":7.184000506668173,"entropy_ratio":0.9911799605257036,"top_rate":0.01931522288290389,"top_value":"mace1250"}},{"alerts":[],"column":"iso_639_3","extras":{"singletons":0,"top_values":[["ell",522],["slv",509],["mkd",497],["bre",353],["swe",347],["ces",346],["pol",345],["sdh",345],["src",343],["por",341],["oss",341],["cat",340],["grc",332],["bsh",289],["bqi",178],["nep",177],["osp",177],["pnt",177],["hin",176],["ron",176]]},"kind":"categorical","n":25731,"n_null":173,"n_unique":142,"null_rate":0.0067234075628619175,"stats":{"cardinality":142,"entropy":7.044449705352019,"entropy_ratio":0.9852725680513358,"top_rate":0.02042413334376712,"top_value":"ell"}},{"alerts":[],"column":"concept","extras":{"singletons":0,"top_values":[["say",170],["man",166],["big",163],["stone",163],["house",163],["foot",161],["hand",161],["head",161],["see",161],["woman",161],["year",161],["day",160],["good",160],["name",160],["water",160],["do",160],["come",159],["give",159],["know",159],["red",159]]},"kind":"categorical","n":25731,"n_null":0,"n_unique":170,"null_rate":0.0,"stats":{"cardinality":170,"entropy":7.40808371156896,"entropy_ratio":0.9998235719264904,"top_rate":0.006606816680268936,"top_value":"say"}},{"alerts":[],"column":"cognate_id","extras":{"histogram":{"counts":[3663,3046,1164,2020,1934,677,687,370,629,897,263,436,392,234,129,141,158,105,99,175,1554,368,274,296,373,695,602,318,281,504,574,263,275,652,273,106,199,285,314,306],"edges":[3.0,252.475,501.95,751.425,1000.9,1250.375,1499.85,1749.325,1998.8,2248.275,2497.75,2747.225,2996.7,3246.1749999999997,3495.65,3745.125,3994.6,4244.075,4493.55,4743.025,4992.5,5241.974999999999,5491.45,5740.925,5990.4,6239.875,6489.349999999999,6738.825,6988.3,7237.775,7487.25,7736.724999999999,7986.2,8235.675,8485.15,8734.625,8984.1,9233.574999999999,9483.05,9732.525,9982.0]},"sample":[8378.0,225.0,607.0,662.0,648.0,91.0,4163.0,146.0,39.0,4958.0,1150.0,257.0,327.0,6178.0,335.0,1261.0,930.0,39.0,1141.0,204.0,264.0,316.0,1330.0,231.0,238.0,293.0,864.0,6379.0,294.0,39.0,228.0,298.0,350.0,44.0,1498.0,1503.0,1555.0,756.0,2283.0,7952.0,6514.0,1613.0,1633.0,220.0,226.0,2833.0,283.0,208.0,283.0,535.0,6611.0,7350.0,5645.0,877.0,565.0,953.0,1897.0,1859.0,2330.0,3148.0,1131.0,263.0,1173.0,146.0,28.0,1247.0,671.0,7318.0,774.0,704.0,437.0,802.0,7455.0,146.0,790.0,7396.0,9446.0,146.0,333.0,5794.0,380.0,1141.0,8246.0,927.0,931.0,6505.0,2805.0,8233.0,6707.0,5776.0,266.0,72.0,8265.0,294.0,6505.0,7955.0,91.0,641.0,852.0,1125.0,384.0,385.0,1105.0,700.0,1147.0,7523.0,1189.0,208.0,39.0,5620.0,4735.0,5713.0,931.0,303.0,2484.0,4900.0,1547.0,671.0,515.0,815.0,5261.0,7318.0,2895.0,1860.0,1109.0,1151.0,267.0,146.0,6715.0,1486.0,5975.0,328.0,157.0,859.0,3141.0,2593.0,44.0,2670.0,153.0,864.0,598.0,968.0,320.0,329.0,252.0,198.0,5467.0,18.0,2703.0,690.0,5716.0,82.0,2268.0,1656.0,5922.0,2502.0,7943.0,7652.0,2443.0,2551.0,870.0,883.0,287.0,953.0,1672.0,5997.0,3082.0,1954.0,718.0,98.0,1969.0,294.0,327.0,8259.0,788.0,1734.0,1486.0,2211.0,3825.0,4896.0,371.0,5456.0,9604.0,335.0,5643.0,4519.0,845.0,39.0,815.0,5008.0,535.0,309.0,5737.0,7382.0,6146.0,147.0,131.0,2458.0,5642.0,1050.0,2336.0,405.0,4989.0,2654.0,6652.0,309.0,796.0,86.0,2401.0,4896.0,589.0,4136.0,649.0,5030.0,5028.0,5271.0,7164.0,7166.0,8269.0,5007.0,5577.0,5145.0,266.0,91.0,815.0,6178.0,1173.0,1189.0,342.0,6096.0,5202.0,5157.0,5163.0,5148.0,3389.0,22.0,5059.0,249.0,2783.0,342.0,747.0,5221.0,744.0,859.0,2078.0,2841.0,7144.0,7163.0,2235.0,2458.0,2448.0,6873.0,7563.0,4123.0,2456.0,202.0,5869.0,6235.0,44.0,5909.0,855.0,223.0,309.0,782.0,5040.0,5125.0,6517.0,1404.0,900.0,82.0,1456.0,6084.0,1512.0,1536.0,2268.0,5015.0,405.0,1519.0,6258.0,4130.0,44.0,411.0,2647.0,2878.0,7136.0,3355.0,6336.0,5925.0,225.0,6390.0,6340.0,6849.0,758.0,5341.0,7206.0,2336.0,4362.0,970.0,5345.0,968.0,18.0,1092.0,4387.0,5546.0,153.0,2456.0,91.0,2235.0,2774.0,7376.0,312.0,5110.0,320.0,2146.0,8267.0,8434.0,5097.0,6787.0,844.0,6339.0,3907.0,5037.0,2916.0,1250.0,1144.0,7567.0,977.0,1025.0,907.0,86.0,2137.0,8696.0,573.0,9413.0,338.0,7730.0,5097.0,1117.0,7551.0,8276.0,301.0,303.0,1146.0,1297.0,7780.0,5007.0,18.0,1157.0,1238.0,5163.0,403.0,5031.0,1248.0,1105.0,8265.0,1168.0,468.0,301.0,4490.0,165.0,8615.0,42.0,276.0,773.0,789.0,805.0,14.0,27.0,8484.0,9762.0,712.0,7952.0,9975.0,9957.0,8440.0,1957.0,9669.0,2235.0,3062.0,60.0,5032.0,28.0,744.0,5223.0,84.0,327.0,5034.0,2647.0,6470.0,6615.0,697.0,405.0,211.0,311.0,268.0,2235.0,5438.0,535.0,2268.0,2011.0,8398.0,5021.0,7523.0,91.0,3057.0,84.0,2336.0,2268.0,703.0,342.0,257.0,745.0,9126.0,999.0,2443.0,5015.0,3080.0,9473.0,6439.0,519.0,8523.0,8543.0,8529.0,8159.0,8607.0,123.0,598.0,4716.0,9494.0,649.0,9485.0,328.0,157.0,888.0,303.0,5040.0,6778.0,6178.0,411.0,6726.0,9197.0,6489.0,225.0,123.0,165.0,5619.0,9387.0,5032.0,9365.0,265.0,300.0,289.0,312.0,1136.0,316.0,189.0,8248.0,343.0,1861.0,294.0,5089.0,6453.0,328.0,327.0,7106.0,7319.0,7523.0,326.0,226.0,1318.0,208.0,1333.0,211.0,8369.0,82.0,314.0,948.0,961.0,294.0,9851.0,7919.0,9216.0,7200.0,9918.0,856.0,6505.0,910.0,266.0,2443.0,953.0,579.0,3832.0,4142.0,42.0,1969.0,314.0,6878.0,7263.0,7262.0,5448.0]},"kind":"numeric","n":25731,"n_null":0,"n_unique":4979,"null_rate":0.0,"stats":{"iqr":5229.0,"kurtosis":-0.9047739398298336,"max":9982.0,"mean":3085.572772142552,"median":1610.0,"min":3.0,"n_outliers":0,"outlier_rate":0.0,"q1":411.0,"q3":5640.0,"skew":0.7307114093140809,"std":3023.6609161049155,"zero_rate":0.0}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 100.0% of rows"}],"column":"source_dataset","extras":{"singletons":0,"top_values":[["iecor",25731]]},"kind":"categorical","n":25731,"n_null":0,"n_unique":1,"null_rate":0.0,"stats":{"cardinality":1,"entropy":-0.0,"entropy_ratio":0.0,"top_rate":1.0,"top_value":"iecor"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","form.stats.one_word_rate","form.stats.duplicate_rate","form.stats.len_mean","iso_639_3.n_unique","iso_639_3.top_values","concept.n_unique","concept.top_values","language_name.top_values","source_dataset.top_rate"],"featured_charts":[{"caption":"Top languages by ISO code \u2014 Greek, Slovenian, and Macedonian lead the sample.","column":"iso_639_3","kind":"bar"},{"caption":"Concept distribution is highly uniform \u2014 each of the 170 concepts has ~160 forms.","column":"concept","kind":"bar"},{"caption":"Which languages are most densely sampled by name (Bakhtiari, Nepali, Italiot Greek).","column":"language_name","kind":"bar"},{"caption":"Form lengths cluster tightly around 5 characters; nearly all entries are single words.","column":"form","kind":"length"},{"caption":"Spread of cognate IDs across the 4,979 cognate sets \u2014 useful for spotting clustering.","column":"cognate_id","kind":"histogram"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset contains 25,731 word forms drawn from a single source ('iecor'), each tagged with a concept, language, and cognate identifier \u2014 essentially a comparative wordlist across 160 languages and 170 concepts. The 'form' column is mostly single-word entries (94.6% one-word, mean length ~5 characters) with about 24.9% duplicates, suggesting many shared or repeated forms across languages. The language coverage is broad and well-balanced (entropy ratio ~0.99 across 142 ISO codes), led by Greek (ell), Slovenian (slv), and Macedonian (mkd). Worth a closer look: the concept distribution is remarkably even (~160-170 forms per concept), and the language_name distribution shows which languages are most densely sampled (Bakhtiari, Nepali, Italiot Greek). The 'source_dataset' column is constant and can be ignored.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","one_word_rate","len_mean","word_median","vocab_size","duplicate_rate","n_duplicates","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short single-word lexical forms \u2014 94.6% are one-word entries with a mean length of 5.4 characters and median word count of 1. The vocabulary spans 16,219 distinct words across 25,731 rows, and top values like 'noga', 'p\u0101', 'd\u016br', 'voda', 'bitter' suggest a multilingual mix (Slavic, Polynesian, Germanic). Notably, 24.9% of rows are duplicates (6,397), yet no single form dominates \u2014 the most frequent ('noga') appears only 24 times.","role":"feature","scope":"column","target":"form","treatment":"Treat as a categorical lexical token; normalize unicode and consider language detection before embedding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median","stats.q1","stats.q3","stats.skew","stats.kurtosis","stats.zero_rate","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"Numeric code with 160 distinct values across 25,731 rows and zero nulls, ranging from 3 to 317 with a near-symmetric distribution (skew -0.05) and flat shape (kurtosis -1.47). The flat, wide spread and integer-looking quartiles (65, 174, 266) suggest this is a categorical language identifier stored as an integer rather than a true numeric measurement. No outliers and no zeros, consistent with a lookup key.","role":"foreign_key","scope":"column","target":"language_id","treatment":"Treat as categorical and left-join to a language lookup table; do not model as a continuous variable."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds language names, with 160 distinct values across 25,731 rows and no nulls. The distribution is remarkably flat: entropy_ratio is 0.996 and the most common value 'Bakhtiari' appears just 178 times (0.69%), suggesting a near-uniform sampling of languages rather than a natural population. Several entries use a 'Family: Variety' convention (e.g., 'Greek: Italiot', 'Breton: Treger'), indicating dialect-level granularity mixed with top-level language names.","role":"feature","scope":"column","target":"language_name","treatment":"Use as a categorical feature; consider splitting on ':' to separate family from variety before encoding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Glottocodes are Glottolog's stable language identifiers, so this column tags each of the 25,731 rows with one of 152 distinct languages. The distribution is remarkably flat: entropy ratio is 0.991 and the most frequent code 'mace1250' covers only 1.93% of rows, with several Slavic and Germanic codes clustered around 340\u2013350 occurrences. No nulls, and a visible drop-off after the top seven codes (down to ~177) suggests a tiered sampling design rather than a long tail.","role":"foreign_key","scope":"column","target":"glottocode","treatment":"left-join on this id to Glottolog metadata, or one-hot/target-encode for modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds ISO 639-3 language codes, with 142 distinct languages spread across 25,731 rows. The distribution is remarkably flat \u2014 entropy ratio of 0.985 and the top code 'ell' covering only 2.04% \u2014 so no single language dominates. Null rate is negligible at 0.67%.","role":"feature","scope":"column","target":"iso_639_3","treatment":"Treat as a categorical feature; one-hot or target-encode given the 142 levels, or group rare codes."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds 170 distinct concept labels (e.g., 'say', 'man', 'big', 'stone', 'house') spread almost perfectly evenly across 25,731 rows, with the top value covering only 0.66% of the data and entropy at 99.98% of the maximum. The vocabulary resembles a Swadesh-style basic concept list, and the near-uniform distribution suggests each concept appears a fixed number of times \u2014 likely once per language or source.","role":"foreign_key","scope":"column","target":"concept","treatment":"Treat as a categorical key; group or pivot on it rather than one-hot encoding given 170 balanced levels."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.zero_rate","stats.skew","stats.kurtosis","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"This is almost certainly a cognate group identifier: 4,979 distinct integer values spread across 25,731 rows (roughly 5x repetition) with no nulls and no zeros. Despite being stored as numeric, the wide range (3 to 9,982), moderate skew (0.73) and negative kurtosis (-0.90) suggest these are arbitrary group labels rather than a measured quantity. The lack of outliers is consistent with categorical-style codes packed into the integer space.","role":"foreign_key","scope":"column","target":"cognate_id","treatment":"Treat as a categorical group key; join or group-by rather than feeding as a numeric feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column is a constant provenance tag identifying the source dataset, with every one of the 25731 rows labelled \"iecor\". Cardinality is 1 and entropy is 0, so it carries no discriminative signal.","role":"metadata","scope":"column","target":"source_dataset","treatment":"Drop before modelling; retain only as a provenance flag."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":2918,"prompt_tokens":9578,"total_tokens":12496}},"language_counts":{},"meta":{"generated_at":"2026-05-01T18:05:52+00:00","mode":"full","row_count":25731,"sampled_rows":25731,"seed":42,"source":"/home/coolhand/servers/diachronica/etymology_atlas/processed/word_forms.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"cognate_id":"numeric","concept":"categorical","form":"text","glottocode":"categorical","iso_639_3":"categorical","language_id":"numeric","language_name":"categorical","source_dataset":"categorical"}}
