% vim: set spell:
\documentclass[a4paper]{article}

\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{ae}           % apparently, this helps me get less ugly pdfs on screen
\usepackage[cyr]{aeguill} % guillemets (goes with ae package above)
\usepackage{graphicx}
\usepackage{url}
\usepackage{xspace}
\usepackage{genistuff}
\usepackage[normalem]{ulem}
\usepackage{color}
\usepackage[margin=1.2cm,columnsep=1cm]{geometry}
\usepackage{fancyhdr}

% \lexsel{<item>}: typesets <item> between guillemets, with a plain
% interword space on either side of the argument.
\newcommand{\lexsel}[1]{%
  « #1 »%
}
% Polarity annotations.  Every annotation is set in typewriter type via
% \koweypol and followed by \xspace.
%   \koweyplusN{n}  -> "+<n>np" in blue
%   \koweyminusN{n} -> "-<n>np" in red
%   \koweyzero      -> "0np" in the current colour
%   \koweyplus / \koweyminus are the n = 1 shorthands.
\newcommand{\koweypol}[1]{\texttt{#1}}
\newcommand{\koweyzero}{\koweypol{0np}\xspace}
\newcommand{\koweyplusN}[1]{%
  \koweypol{\textcolor{blue}{+#1np}}\xspace}
\newcommand{\koweyminusN}[1]{%
  \koweypol{\textcolor{red}{-#1np}}\xspace}
\newcommand{\koweyplus}{\koweyplusN{1}}
\newcommand{\koweyminus}{\koweyminusN{1}}

% Page furniture: no rule under the running header, no paragraph
% indentation, and the fancyhdr page style for every page.
\renewcommand{\headrulewidth}{0pt}
\setlength{\parindent}{0pt}
\pagestyle{fancy}

\begin{document}

% Running header: handout title on the left, author and date on the right,
% both in tiny sans-serif type; the centre footer is cleared.
% \sffamily is used instead of the obsolete two-letter font switch \sf.
\lhead{\tiny\sffamily Surface realisation: ambiguity and determinism}
\rhead{\tiny\sffamily Eric Kow 2008-01-10}
\cfoot{}

% NOTE(review): no \maketitle is visible in this file, so these two
% declarations do not appear to be typeset anywhere -- confirm whether a
% title block was intended.
\author{Eric Kow\\ TALARIS project (INRIA/LORIA)}
\date{10 January 2008\\Brighton}

% -------------------------------------------------------------------------
%
% -------------------------------------------------------------------------

\twocolumn

\section{Surface realisation}

A typical natural language generation pipeline:
\begin{center}
\includegraphics[scale=0.25]{images/nlg-architecture}\\
\includegraphics[scale=0.3]{images/surface-realisation-paraphrases}
\end{center}

\subsection{$L_U$ Flat semantics}

{\small
\begin{enumerate}
\item An $L_U$ formula is a set of literals\\
      {\small \semexpr{l1:aimer(e,j,m), l2:jean(j), l3:marie(m)}}
\item Each literal consists of a predicate, a label and some arguments:
\begin{center}
% literal (l0:aimer(a,j,m))
% == label (l1)
%    predicate (aimer)
%    arguments (a,j,m)
\includegraphics[scale=0.25]{images/flat-semantics}
\end{center}
\item The label and arguments are either constants or unification
      variables.
\end{enumerate}
}

%See also
%
%\tiny
%\bibitem[Shieber 1993]{shieber1993plf}
%Shieber, S. (Computational linguistics, 1993)
%``The problem of logical form equivalence''
%


% -------------------------------------------------------------------------

\subsection{Feature-Based Lexicalised Tree Adjoining Grammar (FB-LTAG)}

An FB-LTAG associates each word with a set of trees.\\
Two combining operations: substitution and adjunction.

\begin{center}
\includegraphics[scale=0.35]{images/tag-example}
\end{center}

\subsubsection{FB-LTAG with an $L_U$ semantics}

Each tree is associated with a semantic representation

\begin{center}
\includegraphics[scale=0.35]{images/tag-example-sem}
\end{center}

$\mathit{Sem}(t_1 + t_2) = \mathit{Sem}(t_1) \cup \mathit{Sem}(t_2)$ modulo unification

\subsubsection{SemFraG}

Reversible grammar for French.  Will be made available.
For now, contact Claire Gardent \url{gardent@loria.fr}

% -------------------------------------------------------------------------
\subsection{Realisation algorithm}

\begin{center}
\includegraphics[scale=0.4]{images/surface-realisation-algo-1}
\end{center}

GenI realiser:\\
\url{http://trac.loria.fr/~geni}

% -------------------------------------------------------------------------
\section{Polarity filtering}

% First row of the ambiguity tables: the three literals of the example
% input semantics, one per tabular column.  No row terminator is included;
% the caller supplies the \\.
\newcommand{\ambiguityexampleSem}{
  \semexpr{l0:picture(p)}
  & \semexpr{l1:cost(c,p,h)}
  & \semexpr{l2:high(h)}}

% Body rows of the lexical-ambiguity table: \hline, two rows of three
% \lexsel items, \hline.  It is expanded inside a three-column tabular,
% directly after the \ambiguityexampleSem row, so each column lists the
% alternative lexical items for the literal heading that column.
\newcommand{\ambiguityexampleLex}{
\hline
\lexsel{picture} &
\lexsel{cost of}   &
\lexsel{is high} \\
% second row: the other lexical item available for each column
\lexsel{painting} &
\lexsel{costs} &
\lexsel{a lot} \\
\hline}

% Same two rows of lexical items as \ambiguityexampleLex, but each item is
% followed by a polarity annotation (\koweyplus, \koweyzero, \koweyminus or
% \koweyminusN{2}).  Expands to \hline, two rows, \hline and is meant to be
% used inside a three-column tabular.
% NOTE(review): the annotations presumably give each item's np polarity
% as used in the "Polarities" subsection -- confirm against the grammar.
\newcommand{\ambiguityexamplePol}{
\hline
\lexsel{picture} \koweyplus &
\lexsel{cost of} \koweyzero &
\lexsel{is high} \koweyminus \\
% second row: the other lexical item available for each column
\lexsel{painting} \koweyplus &
\lexsel{costs} \koweyminusN{2} &
\lexsel{a lot} \koweyplus \\
\hline
}

% Centred, footnote-sized table for the lexical-ambiguity example:
% the literals row (\ambiguityexampleSem) on top of the lexical items
% (\ambiguityexampleLex).  The size change is kept local by the group.
\newcommand{\ambiguityexample}{
  \begingroup
    \footnotesize
    \begin{center}
      \begin{tabular}{|c|c|c|}
        \hline
        \ambiguityexampleSem \\
        \ambiguityexampleLex
      \end{tabular}
    \end{center}
  \endgroup
}
% Variant of \ambiguityexample whose body rows carry polarity annotations:
% the literals row (\ambiguityexampleSem) on top of \ambiguityexamplePol,
% centred and footnote-sized.  The size change is kept local by the group.
\newcommand{\ambiguityexampleP}{
  \begingroup
    \footnotesize
    \begin{center}
      \begin{tabular}{|c|c|c|}
        \hline
        \ambiguityexampleSem \\
        \ambiguityexamplePol
      \end{tabular}
    \end{center}
  \endgroup
}
% Polarity-annotated lexical items only (\ambiguityexamplePol), without the
% literals row, as a centred footnote-sized table.  The top rule comes from
% the \hline that \ambiguityexamplePol itself starts with.
\newcommand{\ambiguityexampleX}{
  \begingroup
    \footnotesize
    \begin{center}
      \begin{tabular}{|c|c|c|}
        \ambiguityexamplePol
      \end{tabular}
    \end{center}
  \endgroup
}


\subsection{Lexical ambiguity}

% NB: deliberate abuse of language to keep parallel with
% parsing
Lexical ambiguity is the possibility for a literal to be
expressed in several ways:

\ambiguityexample

The number of lexical combinations is exponential:
\begin{displaymath}
\prod_{1 \leq i \leq n}{a_i}
\end{displaymath}

{\footnotesize
$n$,   number of literals in an input semantics\\
$a_i$, ambiguity of the $i$-th literal}


% -------------------------------------------------------------------------

\subsection{Syntactic incompatibilities}

{\small
\begin{tabular}{ll}
\lexsel{picture} \lexsel{cost of} \lexsel{is high}
  & \natlang{the cost of the picture is high} \\
\lexsel{picture} \lexsel{costs} \lexsel{a lot}
  & \natlang{the picture costs a lot} \\
\sout{\lexsel{picture} \lexsel{cost of} \lexsel{a lot}}
  & \sout{\natlang{the cost of the picture a lot}} \\
\end{tabular}
}

% -------------------------------------------------------------------------
\subsection{Polarities}

Each lexical entry in the grammar is associated with a set of
polarities which represent its syntactic resources and needs.

\begin{center}
\includegraphics[scale=0.5]{images/le-tableau-coute-cher}
\end{center}

{\footnotesize
\begin{tabular}{lrrrr}
\lexsel{picture} \lexsel{cost of} \lexsel{is high}
  & \koweyplus & \koweyzero & \koweyminus & (= \koweyzero) \\

\lexsel{picture} \lexsel{costs} \lexsel{a lot}
  & \koweyplus & \koweyminusN{2} & \koweyplus & (= \koweyzero) \\

\sout{\lexsel{picture} \lexsel{cost of} \lexsel{a lot}}
  & \koweyplus & \koweyzero & \koweyplus & (= \koweyplusN{2}) \\
\end{tabular}
}

% -------------------------------------------------------------------------

\subsection{Polarity automaton}

\begin{center}
\includegraphics[scale=0.30]{images/basicaut-gpruned}
\end{center}

% -------------------------------------------------------------------------

\subsection{An example}

\natlang{L'homme qui discute philosophie avec Paul dit que Jean part.}

\begin{center}
\footnotesize
\begin{tabular}{|l|rr|}
\hline
& \textbf{no filtering} & \textbf{filtering} \\
\hline
lexical combinations     & 2 436 672 & 4 136   \\
substitutions            &    26 149 & 3 284   \\
adjunctions              &     5 014 &   630   \\
realisation time (s)     &     1 615 &    25   \\
\hline
\end{tabular}
\end{center}

% -------------------------------------------------------------------------

\begin{thebibliography}{perrier2003gi}
\bibitem[Perrier 2003]{perrier2003gi}
Perrier, G. (HDR, 2003)
``Les grammaires d'interaction''
\bibitem[Kow 2005]{kow2005apd}
Kow, E. (ESSLLI student session, 2005, Edinburgh)
``Adapting polarised disambiguation to surface realisation''
\end{thebibliography}

% -------------------------------------------------------------------------
%
% -------------------------------------------------------------------------
\section{Paraphrase selection}

\sout{A speech is going to be made by him.}\\
He is going to make a speech.

% -------------------------------------------------------------------------
\subsection{Metagrammar}

Tree fragments (dominance and linear precedence constraints)
combine to make TAG trees:~\\[1ex]
\begin{center}
\includegraphics[scale=0.75]{images/xmg-sample}
\end{center}

eXtensible MetaGrammar compiler (XMG)\\
\url{http://sourcesup.cru.fr/xmg/}

% -------------------------------------------------------------------------

\subsection{Tree properties for selection}

Enriched input semantics (each literal can be associated
with a set of tree properties, to act as filters)\\[2ex]

% The three example paraphrases reused in this subsection and in the
% Evaluation table, where A is labelled Active and B and C are labelled
% Passive.
\newcommand{\enrichexA}{%
  Joe gives the car to Sue.%
}
\newcommand{\enrichexB}{%
  Sue is given the car by Joe.%
}
\newcommand{\enrichexC}{%
  The car is given to Sue by Joe.%
}

{\footnotesize
\semexpr{l0:give(e,x,y,z)%
[Passive,ToObject]},%
\semexpr{l1:joe(x),l2:sue(y),l3:car(z)}

\enrichexA\\
\sout{\enrichexB}\\
\sout{\enrichexC}
}

% -------------------------------------------------------------------------

\subsection{Evaluation}

\semexpr{l0:give(e,x,y,z),l1:joe(x),l2:sue(y),l3:car(z)}
\begin{center}
$\Downarrow$
\end{center}
\begin{tabular}{rl}
\enrichexA & \semexpr{Active, Object}\\
{\color{red}\enrichexB} & \semexpr{Passive, Object}\\
{\color{red}\enrichexC} & \semexpr{Passive, ToObject}\\
\end{tabular}\\[3ex]

For each pair of paraphrases: do they have distinct \emph{enriched}
semantics?

Yes, for 98\% of pairs (87 cases, or 1528 paraphrases)

% -------------------------------------------------------------------------

\begin{thebibliography}{xmg}
\bibitem[Crabb{\'e} and Duchier 2004]{crabbe2004mr}
Crabb{\'e}, B. and Duchier, D. (2004).  ``Metagrammar Redux''.
\bibitem[Gardent and Kow 2007]{gardent2007san}
Gardent, C. and Kow, E. (ACL 2007, Prague).
``A symbolic approach to near-deterministic surface realisation using Tree Adjoining Grammar''
\end{thebibliography}

\newpage

% -------------------------------------------------------------------------
%
% -------------------------------------------------------------------------

\section{Reducing overgeneration}

Incremental, semi-automatic approach:

\begin{center}
\includegraphics[scale=0.30]{images/test-harness}
\end{center}

% -------------------------------------------------------------------------
\subsection{Derivations log}

For each string: its derivation tree, lexical selection and the
tree properties of each lexical entry used.

{
\footnotesize\begin{verbatim}
Output: Jean se demande si c'est Paul qui vient
demander:n8 <-(s)- venir
demander:n1 <-(s)- jean
venir:n4 <-(s)- paul

demander Tn0ClVs1int-630
  CanonicalSubject NonInvertedNominalSubject
  SententialInterrogative
venir Tn0V-615
  CleftSubject NonInvertedNominalSubject
paul TproperName-45
jean TproperName-45
\end{verbatim}
}


% -------------------------------------------------------------------------
\subsection{Suspects report}

For each lemma: TAG families, trees, tree properties which
\emph{only} appear in cases of overgeneration.

{
\footnotesize\begin{verbatim}
input t90
Lemma: dire
Tn0Vn1 (all) - InfinitiveSubject Passive
  [699] CanonicalCAgent Passive
  [746] CanonicalGenitive dePassive
  [702] CleftCAgentOne Passive
  [752] CleftDont dePassive
\end{verbatim}
}

Also: combinations of lexical items which only appear in
overgeneration.

{
\footnotesize\begin{verbatim}
Input t70
consistently overgenerating derivation items
le:Tdet-17:n0 <-(a)- riche:Tn0vA-90
\end{verbatim}
}


% -------------------------------------------------------------------------

\subsection{A quarter of the output}

We have eliminated 70\% of the strings produced with 13 modifications
to the metagrammar (31 lines, 12 hours).

\begin{center}
\footnotesize
\begin{tabular}{|l|r|r|r|r|}
\hline
      & total & maximum & average & median \\
\hline
before & 28000 &  4900 & 200 & 25 \\
after  &  8400 &   710 &  60 & 12 \\
\hline
\end{tabular}\\
{\footnotesize number of strings per case}
\end{center}

We got greater reductions for longer phrases.

\begin{center}
\includegraphics[scale=0.15]{images/reductions}
\end{center}

% -------------------------------------------------------------------------

\begin{thebibliography}{xmg}
\bibitem[Gardent and Kow 2007]{gardent2007sos}
Gardent, C. and Kow, E. (ENLG 2007).
``Spotting overgeneration suspects''
\end{thebibliography}

\end{document}

