@article {634391,
title = {PCPs and the hardness of generating synthetic data},
journal = {Journal of Cryptology},
volume = {33},
year = {2020},
pages = {2078-2112},
publisher = {Springer-Verlag},
abstract = {
Version History:\ Full version posted as\ ECCC\ TR10-017.
Published earlier in\ Yuval Ishai, ed., Proceedings of the 8th IACR Theory of Cryptography Conference (TCC {\textquoteleft}11), Lecture Notes on Computer Science. Springer-Verlag, Publishers: Vol. 5978, pp. 572-587.\ https://link.springer.com/chapter/10.1007/978-3-642-19571-6_24
Invited to\ J. Cryptology\ selected papers from TCC 2011.
Assuming the existence of one-way functions, we show that there is no polynomial-time, differentially private algorithm\ \(\mathcal{A}\)\ that takes a database\ \(D\ ∈\ ({0,\ 1}^d)^n\)\ and outputs a {\textquotedblleft}synthetic database{\textquotedblright}\ \(\hat{D}\)\ all of whose two-way marginals are approximately equal to those of\ \(D\). (A two-way marginal is the fraction of database rows\ \(x\ ∈ {0,\ 1}^d\)\ with a given pair of values in a given pair of columns.) This answers a question of Barak et al. (PODS {\textquoteleft}07), who gave an algorithm running in time\ \(\mathrm{poly} (n,\ 2^d)\).
Our proof combines a construction of hard-to-sanitize databases based on digital signatures (by Dwork et al., STOC {\textquoteleft}09) with encodings based on probabilistically checkable proofs.
We also present both negative and positive results for generating {\textquotedblleft}relaxed{\textquotedblright} synthetic data, where the fraction of rows in \(D\) satisfying a predicate \(c\) are estimated by applying \(c\) to each row of \(\hat{D}\) and aggregating the results in some way.
},
url = {https://link.springer.com/article/10.1007/s00145-020-09363-y},
author = {Jon Ullman and Salil Vadhan}
}