{"columns":[{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"96.5% duplicate strings"}],"column":"language_id","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[342,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76059],"edges":[2.0,2.025,2.05,2.075,2.1,2.125,2.15,2.175,2.2,2.225,2.25,2.275,2.3,2.325,2.35,2.375,2.4,2.425,2.45,2.475,2.5,2.525,2.55,2.575,2.6,2.625,2.65,2.675,2.7,2.725,2.75,2.775,2.8,2.825,2.85,2.875,2.9,2.925,2.95,2.975,3.0]},"near_unique":false,"sample":["abi","rus","tmn","tag","kmz","yim","kom","tuy","tag","aml","mun","map","gol","abk","guj","cre","bej","tel","zaz","khs","hdi","amp","chk","bgs","pai","saw","men","cti","awp","yid","niu","nmm","nmd","kfe","jak","knz","wem","lmh","kpa","mal","sul","wal","gnb","wah","wsk","ykt","ila","kfe","chc","tru"],"top_values":[["eng",159],["fre",158],["ger",157],["rus",156],["fin",155],["grk",155],["hun",155],["spa",155],["tur",154],["ind",153],["mnd",153],["jpn",151],["ame",150],["geo",150],["bsq",149],["eve",149],["kor",149],["lez",149],["abk",148],["hau",148]],"top_words":[["ind",56],["zul",48],["fin",48],["jpn",47],["abk",45],["kro",45],["tag",45],["ame",45],["eng",45],["kho",44],["spa",43],["mal",43],["aeg",43],["chk",43],["mar",43],["apu",43],["may",42],["ger",42],["ngi",41],["kse",41],["epe",41],["pau",41],["hun",40],["goo",40],["yag",40]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76401,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":76475,"n_null":74,"n_unique":2659,"null_rate":0.0009676364825106244,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.9651967906179238,"emoji_rate":0.0,"len_max":3,"len_mean":2.995523618800801,"len_median":3.0,"len_min":2,"len_p95":3.0,"n_duplicates":73742,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":118.25900000000001,"url_rate":0.0,"vocab_size":2209,"word_mean":1.0,"word_median":1.0}},{"alerts":[],"column":"feature_id","extras":{"singletons":0,"top_values":[["83A",1518],["82A",1496],["81A",1376],["87A",1367],["143A",1325],["143E",1325],["143F",1325],["143G",1325],["97A",1316],["86A",1249],["88A",1225],["144A",1190],["85A",1184],["112A",1157],["89A",1154],["95A",1142],["69A",1131],["33A",1066],["51A",1031],["26A",969]]},"kind":"categorical","n":76475,"n_null":0,"n_unique":192,"null_rate":0.0,"stats":{"cardinality":192,"entropy":7.103389681832427,"entropy_ratio":0.9365095320058678,"top_rate":0.019849624060150377,"top_value":"83A"}},{"alerts":[],"column":"feature_name","extras":{"singletons":0,"top_values":[["Order of Object and Verb",1518],["Order of Subject and Verb",1496],["Order of Subject, Object and Verb",1376],["Order of Adjective and Noun",1367],["Order of Negative Morpheme and Verb",1325],["Preverbal Negative Morphemes",1325],["Postverbal Negative Morphemes",1325],["Minor morphological means of signaling negation",1325],["Relationship between the Order of Object and Verb and the Order of Adjective and Noun",1316],["Order of Genitive and Noun",1249],["Order of Demonstrative and Noun",1225],["Position of Negative Word With Respect to Subject, Object, and Verb",1190],["Order of Adposition and Noun Phrase",1184],["Negative Morphemes",1157],["Order of Numeral and Noun",1154],["Relationship between the Order of Object and Verb and the Order of Adposition and Noun Phrase",1142],["Position of Tense-Aspect Affixes",1131],["Coding of Nominal Plurality",1066],["Position of Case Affixes",1031],["Prefixing vs. Suffixing in Inflectional Morphology",969]]},"kind":"categorical","n":76475,"n_null":0,"n_unique":192,"null_rate":0.0,"stats":{"cardinality":192,"entropy":7.103389681832427,"entropy_ratio":0.9365095320058678,"top_rate":0.019849624060150377,"top_value":"Order of Object and Verb"}},{"alerts":[{"code":"high_skew","level":"info","message":"skew=+3.49"}],"column":"value","extras":{"histogram":{"counts":[27379,21173,6771,0,9917,3602,0,2730,1392,0,1042,597,0,32,265,0,43,68,0,251,190,0,156,44,0,127,83,0,358,202,0,21,18,0,3,3,0,4,3,1],"edges":[1.0,1.675,2.35,3.0250000000000004,3.7,4.375,5.050000000000001,5.7250000000000005,6.4,7.075,7.75,8.425,9.100000000000001,9.775,10.450000000000001,11.125,11.8,12.475000000000001,13.15,13.825000000000001,14.5,15.175,15.850000000000001,16.525,17.200000000000003,17.875,18.55,19.225,19.900000000000002,20.575000000000003,21.25,21.925,22.6,23.275000000000002,23.950000000000003,24.625,25.3,25.975,26.650000000000002,27.325000000000003,28.0]},"sample":[1.0,6.0,4.0,4.0,4.0,4.0,1.0,1.0,2.0,2.0,1.0,3.0,3.0,3.0,1.0,2.0,1.0,1.0,3.0,9.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,6.0,2.0,4.0,1.0,4.0,5.0,4.0,1.0,3.0,4.0,1.0,2.0,5.0,8.0,2.0,1.0,2.0,6.0,2.0,1.0,2.0,1.0,1.0,1.0,6.0,8.0,1.0,3.0,1.0,4.0,1.0,4.0,1.0,2.0,2.0,14.0,1.0,1.0,4.0,5.0,1.0,2.0,5.0,1.0,1.0,1.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,4.0,4.0,5.0,1.0,1.0,2.0,1.0,2.0,1.0,6.0,1.0,1.0,4.0,2.0,4.0,1.0,5.0,7.0,2.0,2.0,4.0,2.0,6.0,4.0,1.0,11.0,2.0,1.0,2.0,4.0,3.0,4.0,1.0,1.0,8.0,1.0,5.0,2.0,1.0,2.0,2.0,1.0,3.0,2.0,7.0,2.0,4.0,5.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,1.0,2.0,5.0,4.0,2.0,1.0,1.0,5.0,2.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,3.0,1.0,1.0,4.0,4.0,2.0,4.0,2.0,2.0,2.0,1.0,1.0,3.0,1.0,3.0,4.0,2.0,4.0,6.0,1.0,3.0,1.0,7.0,1.0,1.0,1.0,1.0,4.0,4.0,2.0,2.0,1.0,3.0,4.0,2.0,4.0,2.0,2.0,6.0,5.0,1.0,2.0,4.0,5.0,1.0,2.0,12.0,1.0,18.0,2.0,2.0,3.0,2.0,1.0,2.0,5.0,1.0,2.0,2.0,10.0,2.0,5.0,3.0,4.0,1.0,6.0,2.0,5.0,5.0,1.0,1.0,4.0,3.0,2.0,3.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,19.0,14.0,1.0,1.0,1.0,1.0,2.0,7.0,3.0,2.0,8.0,2.0,1.0,4.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,4.0,1.0,8.0,1.0,6.0,2.0,6.0,2.0,2.0,4.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,4.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,4.0,10.0,4.0,2.0,5.0,1.0,2.0,3.0,8.0,8.0,4.0,4.0,1.0,2.0,21.0,1.0,2.0,1.0,1.0,5.0,4.0,1.0,1.0,2.0,3.0,3.0,1.0,8.0,5.0,6.0,21.0,2.0,6.0,2.0,1.0,5.0,2.0,6.0,3.0,1.0,2.0,20.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,3.0,1.0,3.0,1.0,4.0,3.0,6.0,1.0,5.0,6.0,7.0,4.0,2.0,1.0,1.0,1.0,5.0,1.0,6.0,4.0,3.0,4.0,1.0,1.0,2.0,1.0,2.0,8.0,4.0,2.0,1.0,11.0,23.0,1.0,1.0,1.0,18.0,4.0,2.0,2.0,2.0,1.0,3.0,3.0,7.0,7.0,8.0,5.0,2.0,4.0,2.0,1.0,1.0,1.0,5.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,20.0,2.0,2.0,3.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0,6.0,4.0,2.0,4.0,4.0,1.0,2.0,1.0,1.0,1.0,5.0,3.0,1.0,3.0,2.0,6.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,4.0,4.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,1.0,6.0,2.0,3.0,2.0,3.0,4.0,1.0,1.0,1.0,1.0,20.0,16.0,5.0,2.0,1.0,1.0,1.0,4.0,2.0,1.0,4.0,2.0,2.0,1.0,3.0,2.0,3.0,3.0,1.0,1.0,4.0,3.0,1.0,1.0,1.0,4.0,1.0,2.0,6.0,9.0,1.0,1.0,1.0,1.0,3.0]},"kind":"numeric","n":76475,"n_null":0,"n_unique":28,"null_rate":0.0,"stats":{"iqr":3.0,"kurtosis":16.360895410436576,"max":28.0,"mean":2.854436090225564,"median":2.0,"min":1.0,"n_outliers":2469,"outlier_rate":0.03228506047728016,"q1":1.0,"q3":4.0,"skew":3.492637589549751,"std":2.82443353242618,"zero_rate":0.0}},{"alerts":[{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"allcaps","level":"info","message":"100.0% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"},{"code":"duplicates","level":"warn","message":"98.5% duplicate strings"}],"column":"value_name","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[4995,0,0,0,0,0,0,0,0,0,0,0,0,45403,0,0,0,0,0,0,0,0,0,0,0,0,24205,0,0,0,0,0,0,0,0,0,0,0,0,1872],"edges":[4.0,4.075,4.15,4.225,4.3,4.375,4.45,4.525,4.6,4.675,4.75,4.825,4.9,4.975,5.05,5.125,5.2,5.275,5.35,5.425,5.5,5.575,5.65,5.725,5.8,5.875,5.95,6.025,6.1,6.175,6.25,6.324999999999999,6.4,6.475,6.55,6.625,6.699999999999999,6.775,6.85,6.925,7.0]},"near_unique":false,"sample":["18A-1","96A-2","87A-2","144J-7","144P-4","95A-1","144B-3","114A-7","131A-1","96A-4","88A-1","144H-4","83A-1","108B-4","26A-1","45A-1","136B-1","121A-1","113A-1","91A-2","15A-8","90C-1","107A-2","9A-3","130A-2","85A-1","144A-21","37A-5","144N-8","102A-1","85A-2","136A-1","92A-6","47A-1","84A-6","29A-3","91A-1","88A-6","86A-1","93A-1","97A-4","144Q-4","38A-3","13A-2","89A-2","1A-3","142A-2","64A-2","92A-6","97A-5"],"top_values":[["143G-4",1315],["82A-1",1192],["87A-2",879],["83A-1",712],["143F-4",712],["83A-2",705],["86A-1",685],["143E-1",682],["69A-2",667],["93A-2",615],["89A-2",608],["116A-1",585],["90A-1",579],["90C-1",579],["85A-1",577],["81A-1",564],["88A-2",562],["88A-1",542],["11A-1",525],["143A-1",525]],"top_words":[["143g-4",364],["82a-1",313],["87a-2",237],["83a-2",193],["143e-1",187],["86a-1",182],["83a-1",181],["93a-2",173],["89a-2",171],["143f-4",167],["69a-2",164],["90c-1",157],["85a-1",151],["90a-1",151],["116a-1",149],["88a-1",145],["81a-1",144],["33a-2",139],["11a-1",139],["130a-2",138],["88a-2",137],["143a-1",136],["6a-1",135],["81a-2",132],["19a-1",129]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,76475,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":76475,"n_null":0,"n_unique":1139,"null_rate":0.0,"stats":{"allcaps_rate":1.0,"boilerplate_rate":0.0,"duplicate_rate":0.9851062438705459,"emoji_rate":0.0,"len_max":7,"len_mean":5.300150375939849,"len_median":5.0,"len_min":4,"len_p95":6.0,"n_duplicates":75336,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":121.22000000000004,"url_rate":0.0,"vocab_size":911,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 100.0% of rows"}],"column":"source","extras":{"singletons":0,"top_values":[["WALS",76475]]},"kind":"categorical","n":76475,"n_null":0,"n_unique":1,"null_rate":0.0,"stats":{"cardinality":1,"entropy":-0.0,"entropy_ratio":0.0,"top_rate":1.0,"top_value":"WALS"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","columns.value_name.n_unique","columns.value_name.top_values","columns.language_id.n_unique","columns.language_id.top_values","columns.source.top_value","columns.source.top_rate","columns.value.skew","columns.value.kurtosis","columns.value.outlier_rate","columns.value.max","columns.feature_id.n_unique","columns.feature_name.top_values","columns.feature_name.top_rate"],"featured_charts":[{"caption":"See which typological features are most heavily represented; word-order and negation features dominate the top of the distribution.","column":"feature_name","kind":"bar"},{"caption":"Check the heavy right skew \u2014 most observations fall at values 1\u20134, but a long tail extends up to 28.","column":"value","kind":"histogram"},{"caption":"Top languages by number of feature observations are fairly even (around 150\u2013160 each), reflecting WALS's broad coverage.","column":"language_id","kind":"bar"},{"caption":"The most frequent value codes (e.g., '143G-4', '82A-1') reveal which feature-value combinations recur across many languages.","column":"value_name","kind":"bar"},{"caption":"Confirms that 100% of rows come from WALS \u2014 the column is constant and adds no variation.","column":"source","kind":"donut"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset contains 76,475 rows of linguistic feature observations, all sourced from WALS (World Atlas of Language Structures). Each row pairs a language (2,659 unique language IDs) with one of 192 typological features (e.g., 'Order of Object and Verb') and a categorical value encoded as both a short code (value_name) and a small integer (value). The most common features cluster around word order and negation, with 'Order of Object and Verb' being the top feature at 1,518 rows. Worth a closer look: the `value` column is highly skewed (skew 3.49, kurtosis 16.4) with ~3.2% outliers reaching up to 28, suggesting most features have only a few possible values but a handful have many categories. The `source` column is constant ('WALS') and can be ignored as a variable.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_mean","stats.len_max","stats.one_word_rate","stats.duplicate_rate","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds 2-3 character language codes (ISO 639-style: eng, fre, ger, rus...), with len_mean 2.996 and one_word_rate 1.0. With 76,475 rows but only 2,659 unique values and a 96.5% duplicate_rate, it behaves as a categorical key rather than free text. The top codes are remarkably balanced (eng 159, fre 158, ger 157), suggesting a curated multilingual catalogue rather than organic usage.","role":"foreign_key","scope":"column","target":"language_id","treatment":"Treat as a categorical code; left-join to a language lookup table."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"feature_id is a categorical code with 192 distinct values across 76,475 rows and no nulls, with codes like '83A', '82A', '143E' suggesting a numeric prefix plus letter suffix scheme. The distribution is remarkably flat: entropy ratio 0.937, top value '83A' covers only 1.98% of rows, and four '143' suffixes ('143A','143E','143F','143G') tie at exactly 1325 occurrences, hinting at a structured co-occurrence rather than random sampling.","role":"feature","scope":"column","target":"feature_id","treatment":"Treat as a categorical feature; consider splitting the numeric prefix and letter suffix into separate fields before encoding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column names linguistic typology features, almost certainly WALS-style feature labels (e.g., 'Order of Object and Verb', 'Order of Subject and Verb'). With 192 distinct values across 76,475 rows and a top rate of just 1.98%, the distribution is remarkably flat (entropy ratio 0.94), suggesting each feature is sampled across many languages rather than a few features dominating. There are no nulls and no obvious duplicates among the top values.","role":"feature","scope":"column","target":"feature_name","treatment":"Use as a categorical key; one-hot or target-encode, or pivot to wide form keyed by feature_name."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median","stats.q1","stats.q3","stats.iqr","stats.skew","stats.kurtosis","stats.n_outliers","stats.outlier_rate","stats.zero_rate"],"model":"anthropic:claude-opus-4-7","narrative":"A small-integer count or rating feature with only 28 distinct values ranging from 1 to 28, mean 2.85 and median 2. The distribution is heavily right-skewed (skew 3.49, kurtosis 16.36), with 2,469 outliers (3.23%) stretching the tail to 28 while the IQR sits tight at 1\u20134. No nulls or zeros, so every row carries a positive count.","role":"feature","scope":"column","target":"value","treatment":"log1p-transform or cap the upper tail before modelling to tame the skew."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.allcaps_rate","stats.one_word_rate","stats.duplicate_rate","stats.len_min","stats.len_max","stats.len_mean","stats.vocab_size","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short alphanumeric codes (e.g. '143G-4', '82A-1') \u2014 uppercase, single-token, 4-7 characters long, with 1.0 one_word_rate and 1.0 allcaps_rate. Across 76,475 rows there are only 1,139 distinct values and a 0.985 duplicate_rate, so it behaves as a categorical key rather than free text. The pattern (digits + letter + dash + digit) suggests a typology or feature-value identifier drawn from a fixed vocabulary of 911 tokens.","role":"feature","scope":"column","target":"value_name","treatment":"Treat as a categorical code and encode (e.g. target or one-hot) rather than tokenizing as text."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column records the data origin and holds the constant value \"WALS\" across all 76,475 rows. With cardinality of 1 and entropy of 0.0, it carries no information for modelling and merely tags the dataset's provenance.","role":"metadata","scope":"column","target":"source","treatment":"Drop before modelling; retain only as a provenance tag."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":2520,"prompt_tokens":8485,"total_tokens":11005}},"language_counts":{},"meta":{"generated_at":"2026-05-01T17:23:21+00:00","mode":"full","row_count":76475,"sampled_rows":76475,"seed":42,"source":"/home/coolhand/servers/diachronica/etymology_atlas/parquet/linguistic_features.parquet"},"notes":[],"saturn_version":"0.2.0","schema":{"feature_id":"categorical","feature_name":"categorical","language_id":"text","source":"categorical","value":"numeric","value_name":"text"}}
