synthphonia/parser/ioexamples.rs
1// use crate::galloc::alloc_iter;
2
3use std::collections::HashSet;
4
5use crate::galloc::{self, AllocForIter};
6
7use crate::value::ConstValue;
8
9use super::problem::{new_custom_error_span, FunSig, SynthFun};
10
11use super::prod::ProdRule;
12
13use super::problem::Error;
14
15use crate::value::Type;
16
17use super::problem::Rule;
18
19use counter::Counter;
20use itertools::Itertools;
21use pest::error::InputLocation;
22use pest::iterators::Pair;
23
24use crate::value::Value;
25use derive_more::DebugCustom;
26#[derive(DebugCustom, Clone)]
27#[debug(fmt = "{:?} -> {:?}", inputs, output)]
28/// A struct that holds input and output examples for string synthesis problems.
29///
30/// The structure consists of two fields: `inputs`, which is a vector containing multiple `Value` elements, and `output`, a single `Value` representing the expected result.
31/// This setup is designed to facilitate the storage and retrieval of example data necessary for evaluating and validating synthesis algorithms, by providing concrete cases of input-output relationships.
32///
33pub struct IOExamples {
34 pub(crate) inputs: Vec<Value>,
35 pub(crate) output: Value,
36}
37
38impl IOExamples {
39 /// Parses a collection of input/output examples according to a specified function signature and optional deduplication flag, returning a structured set of examples or an error.
40 ///
41 /// It begins by extracting relevant metadata from the provided function signature, such as function name, argument types, and return type.
42 /// The function processes the provided examples by iterating over them, ensuring each example contains a correct number of arguments and matching types.
43 /// If the 'dedup' parameter is set to true, duplicates are removed using a `HashSet`.
44 /// Finally, the function constructs the `inputs` and `output`, organizing each example's inputs by type before returning the assembled `IOExamples` structure.
45 ///
46 pub(crate) fn parse(examples: Pair<'_, Rule>, sig: &FunSig, dedup: bool) -> Result<Self, Error> {
47 let name = sig.name.as_str();
48 let args = sig.args.as_slice();
49 let rettype = sig.rettype;
50 let mut types = args.iter().map(|x| x.1).collect_vec();
51 types.push(rettype);
52 let mut v: Vec<_> = examples
53 .into_inner()
54 .map(|x| {
55 let span = x.as_span();
56 let v = x.into_inner().skip(1).collect_vec();
57 let v: Vec<_> = v.into_iter().map(|x| ConstValue::parse(x)).try_collect()?;
58 if v.len() != types.len() {
59 return Err(new_custom_error_span(format!("wrong number of arguments for {}: expected", name), span));
60 }
61 for (value, typ) in v.iter().zip(types.iter()) {
62 if value.ty() != *typ {
63 return Err(new_custom_error_span(format!("wrong type for {}", name), span));
64 }
65 }
66 Ok(v)
67 }).try_collect()?;
68
69 if dedup {
70 let set: HashSet<_> = v.iter().cloned().collect();
71 v = set.into_iter().collect_vec();
72 }
73
74 let mut inputs = types.iter().enumerate().map(|(i, ty)| Value::from_const(*ty, v.iter().map(|input| &input[i]).cloned())).collect_vec();
75 let output = inputs.pop().unwrap();
76 Ok(Self { inputs, output })
77 }
78
79 /// Extracts and returns a list of constant substrings identified in the input and output examples of string synthesis problems.
80 ///
81 /// The method iterates over all input strings and the output string, treating them as a unified sequence.
82 /// For each string, it generates all possible substrings using `all_slices` and counts their occurrences.
83 /// It then evaluates each distinct substring, checking for specific filtering conditions: the substring must appear with sufficient frequency, must either be a significant length or show certain frequency patterns, and should not be simple numeric or alphanumeric characters.
84 /// Substrings meeting these criteria that are not already surpassed in count by longer substrings are added to the list of constants.
85 /// This approach helps in identifying significant repeating string patterns, which can play a crucial role in constructing string transformation rules.
86 pub fn extract_constants(&self) -> Vec<&'static str> {
87 let mut counter = Counter::<&str, usize>::new();
88 let mut total_len = 0;
89 for s1 in self.inputs.iter().chain(std::iter::once(&self.output)) {
90 if let Value::Str(a) = s1 {
91 for s2 in a.iter() {
92 for s in all_slices(s2) {
93 counter[&s] += 1;
94 total_len += s.len();
95 }
96 }
97 }
98 }
99
100 let mut constants: Vec<&'static str> = Vec::new();
101 for (k, v) in counter.iter() {
102 let mut flag = false;
103 if *v >= std::cmp::max(3, total_len / 200) {
104 if k.len() == 1 && k.chars().all(char::is_alphanumeric) {
105 continue;
106 }
107 if k.chars().all(char::is_numeric) {
108 continue;
109 }
110 if k.len() == 1 {
111 flag = true;
112 }
113 if k.len() >= 6 {
114 flag = true;
115 } else if k.len() >= 4 && *v >= std::cmp::max(5, total_len / 100) {
116 flag = true;
117 } else if *v >= std::cmp::max(8, total_len / 30) {
118 flag = true;
119 }
120
121 if flag && constants.iter().filter(|c| c.contains(k)).all(|c| counter[k] > counter[c] + 1) {
122 constants.retain(|c| !c.contains(k) || counter[k] + 1 < counter[c]);
123 constants.push(k);
124 }
125 }
126 }
127
128 constants
129 }
130}
131
132/// Generates an iterator over all possible slices of the input string.
133///
134///
135/// This function takes a string slice as input and creates an iterator that yields each possible substring of the input string.
136/// It uses a range from 0 to the length of the string to initiate the starting index of each slice.
137/// For each starting index, it employs `flat_map` combined with `char_indices` and `skip` to navigate through the string, creating substrings from each starting index to each subsequent character index.
138/// The resulting iterator efficiently covers all contiguous substrings in the original string, ensuring comprehensive slice generation without allocating additional string storage.
139fn all_slices(a: &str) -> impl Iterator<Item = &str> {
140 (0..a.len()).flat_map(move |i| a.char_indices().skip(i).map(move |(j, _)| &a[i..j + 1]))
141}