synthphonia/parser/
ioexamples.rs

1// use crate::galloc::alloc_iter;
2
3use std::collections::HashSet;
4
5use crate::galloc::{self, AllocForIter};
6
7use crate::value::ConstValue;
8
9use super::problem::{new_custom_error_span, FunSig, SynthFun};
10
11use super::prod::ProdRule;
12
13use super::problem::Error;
14
15use crate::value::Type;
16
17use super::problem::Rule;
18
19use counter::Counter;
20use itertools::Itertools;
21use pest::error::InputLocation;
22use pest::iterators::Pair;
23
24use crate::value::Value;
25use derive_more::DebugCustom;
26#[derive(DebugCustom, Clone)]
27#[debug(fmt = "{:?} -> {:?}", inputs, output)]
28/// A struct that holds input and output examples for string synthesis problems. 
29/// 
30/// The structure consists of two fields: `inputs`, which is a vector containing multiple `Value` elements, and `output`, a single `Value` representing the expected result. 
31/// This setup is designed to facilitate the storage and retrieval of example data necessary for evaluating and validating synthesis algorithms, by providing concrete cases of input-output relationships.
32/// 
33pub struct IOExamples {
34    pub(crate) inputs: Vec<Value>,
35    pub(crate) output: Value,
36}
37
38impl IOExamples {
39    /// Parses a collection of input/output examples according to a specified function signature and optional deduplication flag, returning a structured set of examples or an error. 
40    /// 
41    /// It begins by extracting relevant metadata from the provided function signature, such as function name, argument types, and return type. 
42    /// The function processes the provided examples by iterating over them, ensuring each example contains a correct number of arguments and matching types. 
43    /// If the 'dedup' parameter is set to true, duplicates are removed using a `HashSet`. 
44    /// Finally, the function constructs the `inputs` and `output`, organizing each example's inputs by type before returning the assembled `IOExamples` structure.
45    /// 
46    pub(crate) fn parse(examples: Pair<'_, Rule>, sig: &FunSig, dedup: bool) -> Result<Self, Error> {
47        let name = sig.name.as_str();
48        let args = sig.args.as_slice();
49        let rettype = sig.rettype;
50        let mut types = args.iter().map(|x| x.1).collect_vec();
51        types.push(rettype);
52        let mut v: Vec<_> = examples
53            .into_inner()
54            .map(|x| {
55                let span = x.as_span();
56                let v = x.into_inner().skip(1).collect_vec();
57                let v: Vec<_> = v.into_iter().map(|x| ConstValue::parse(x)).try_collect()?;
58                if v.len() != types.len() {
59                    return Err(new_custom_error_span(format!("wrong number of arguments for {}: expected", name), span));
60                }
61                for (value, typ) in v.iter().zip(types.iter()) {
62                    if value.ty() != *typ {
63                        return Err(new_custom_error_span(format!("wrong type for {}", name), span));
64                    }
65                }
66                Ok(v)
67            }).try_collect()?;
68            
69        if dedup {
70            let set: HashSet<_> = v.iter().cloned().collect();
71            v = set.into_iter().collect_vec();
72        }
73
74        let mut inputs = types.iter().enumerate().map(|(i, ty)| Value::from_const(*ty, v.iter().map(|input| &input[i]).cloned())).collect_vec();
75        let output = inputs.pop().unwrap();
76        Ok(Self { inputs, output })
77    }
78    
79    /// Extracts and returns a list of constant substrings identified in the input and output examples of string synthesis problems.
80    /// 
81    /// The method iterates over all input strings and the output string, treating them as a unified sequence. 
82    /// For each string, it generates all possible substrings using `all_slices` and counts their occurrences. 
83    /// It then evaluates each distinct substring, checking for specific filtering conditions: the substring must appear with sufficient frequency, must either be a significant length or show certain frequency patterns, and should not be simple numeric or alphanumeric characters. 
84    /// Substrings meeting these criteria that are not already surpassed in count by longer substrings are added to the list of constants. 
85    /// This approach helps in identifying significant repeating string patterns, which can play a crucial role in constructing string transformation rules.
86    pub fn extract_constants(&self) -> Vec<&'static str> {
87        let mut counter = Counter::<&str, usize>::new();
88        let mut total_len = 0;
89        for s1 in self.inputs.iter().chain(std::iter::once(&self.output)) {
90            if let Value::Str(a) = s1 {
91                for s2 in a.iter() {
92                    for s in all_slices(s2) {
93                        counter[&s] += 1;
94                        total_len += s.len();
95                    }
96                }
97            }
98        }
99
100        let mut constants: Vec<&'static str> = Vec::new();
101        for (k, v) in counter.iter() {
102            let mut flag = false;
103            if *v >= std::cmp::max(3, total_len / 200) {
104                if k.len() == 1 && k.chars().all(char::is_alphanumeric) {
105                    continue;
106                }
107                if k.chars().all(char::is_numeric) {
108                    continue;
109                }
110                if k.len() == 1 {
111                    flag = true;
112                }
113                if k.len() >= 6 {
114                    flag = true;
115                } else if k.len() >= 4 && *v >= std::cmp::max(5, total_len / 100) {
116                    flag = true;
117                } else if *v >= std::cmp::max(8, total_len / 30) {
118                    flag = true;
119                }
120
121                if flag && constants.iter().filter(|c| c.contains(k)).all(|c| counter[k] > counter[c] + 1) {
122                    constants.retain(|c| !c.contains(k) || counter[k] + 1 < counter[c]);
123                    constants.push(k);
124                }
125            }
126        }
127
128        constants
129    }
130}
131
132/// Generates an iterator over all possible slices of the input string. 
133/// 
134/// 
135/// This function takes a string slice as input and creates an iterator that yields each possible substring of the input string. 
136/// It uses a range from 0 to the length of the string to initiate the starting index of each slice. 
137/// For each starting index, it employs `flat_map` combined with `char_indices` and `skip` to navigate through the string, creating substrings from each starting index to each subsequent character index. 
138/// The resulting iterator efficiently covers all contiguous substrings in the original string, ensuring comprehensive slice generation without allocating additional string storage.
139fn all_slices(a: &str) -> impl Iterator<Item = &str> {
140    (0..a.len()).flat_map(move |i| a.char_indices().skip(i).map(move |(j, _)| &a[i..j + 1]))
141}