<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>PDF Clown&#039;s Blog</title>
	<atom:link href="http://pdfclown.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://pdfclown.wordpress.com</link>
	<description>Developing a free/libre open source PDF library</description>
	<lastBuildDate>Fri, 27 Jan 2012 15:17:37 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='pdfclown.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://0.gravatar.com/blavatar/811b9eaf640a5f0d10e0b71d089a865b?s=96&#038;d=http%3A%2F%2Fs2.wp.com%2Fi%2Fbuttonw-com.png</url>
		<title>PDF Clown&#039;s Blog</title>
		<link>http://pdfclown.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://pdfclown.wordpress.com/osd.xml" title="PDF Clown&#039;s Blog" />
	<atom:link rel='hub' href='http://pdfclown.wordpress.com/?pushpress=hub'/>
		<item>
		<title>What about screencasts on PDF Clown use?</title>
		<link>http://pdfclown.wordpress.com/2012/01/20/what-about-screencasts-on-pdf-clown-use/</link>
		<comments>http://pdfclown.wordpress.com/2012/01/20/what-about-screencasts-on-pdf-clown-use/#comments</comments>
		<pubDate>Fri, 20 Jan 2012 19:18:23 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Help]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=420</guid>
		<description><![CDATA[I&#8217;m considering to make screencasts on the use of the library. Topics are still under definition: what would you like to see in action? Unleash your curiosity and let me know! PS: I use open-source IDEs only, so don&#8217;t expect me to tweak around with proprietary tools like MS Visual Studio&#8230;<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=420&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://pdfclown.files.wordpress.com/2012/01/screencastpromo.jpg"><img class="aligncenter size-full wp-image-419" title="Eclipse IDE screenshot" src="http://pdfclown.files.wordpress.com/2012/01/screencastpromo.jpg?w=700&#038;h=437" alt="" width="700" height="437" style="border:none;" /></a></p>
<p>I&#8217;m considering making <strong>screencasts on the use of the library</strong>.</p>
<p>Topics are still under definition: <em>what would you like to see in action?</em></p>
<p>Unleash your curiosity and let me know!</p>
<p>PS: I use open-source IDEs only, so don&#8217;t expect me to tweak around with proprietary tools like MS Visual Studio&#8230; <img src='http://s1.wp.com/wp-includes/images/smilies/icon_wink.gif' alt=';-)' class='wp-smiley' /> </p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/420/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/420/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/420/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=420&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2012/01/20/what-about-screencasts-on-pdf-clown-use/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2012/01/screencastpromo.jpg" medium="image">
			<media:title type="html">Eclipse IDE screenshot</media:title>
		</media:content>
	</item>
		<item>
		<title>Waiting for PDF Clown 0.1.2 release</title>
		<link>http://pdfclown.wordpress.com/2011/12/09/waiting-for-pdf-clown-0-1-2-release/</link>
		<comments>http://pdfclown.wordpress.com/2011/12/09/waiting-for-pdf-clown-0-1-2-release/#comments</comments>
		<pubDate>Fri, 09 Dec 2011 18:06:53 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Development]]></category>
		<category><![CDATA[DOM]]></category>
		<category><![CDATA[editing]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[structure]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=324</guid>
		<description><![CDATA[1. PDF Clown DOM Inspector Since its earliest versions, PDF Clown has been shipped including a simple Swing-based proof of concept for viewing PDF file structures. Now that little fledgling is going to become a comprehensive tool for the visual editing of the structure of PDF files: PDF Clown DOM Inspector. It will be part [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=324&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<h3>1. PDF Clown DOM Inspector</h3>
<p>Since its earliest versions, PDF Clown has been shipped including a simple Swing-based proof of concept for viewing PDF file structures. Now that little fledgling is going to become a <em>comprehensive tool for the visual editing of the structure of PDF files</em>: <strong>PDF Clown DOM Inspector</strong>. It will be part of the next 0.1.2 version as a dedicated project within the PDF Clown distribution.</p>
<p><img class="aligncenter size-full wp-image-371" style="border:none;" title="PDF Clown layers" src="http://pdfclown.files.wordpress.com/2011/12/layersoverview.png?w=700" alt=""   /></p>
<p>This tool conforms to the PDF model as defined by PDF Clown (see the diagram above), which adheres to the official <a href="http://www.adobe.com/devnet/pdf/pdf_reference.html">PDF Reference 1.7/ISO 32000-1</a>. This implies that <em>a PDF file is represented through several concurrent views which work at different abstraction levels</em>: Document view (document layer), File view (file/object layer, hierarchical) and XRef view (file/object layer, flat).</p>
<h4>1.1. Document view</h4>
<p><a href="http://pdfclown.files.wordpress.com/2011/12/inspector_doc_render.png"><img class="aligncenter size-full wp-image-325" style="border:none;" title="PDF Clown DOM Inspector - Renderer and document structure" src="http://pdfclown.files.wordpress.com/2011/12/inspector_doc_render.png?w=700&#038;h=413" alt="" width="700" height="413" /></a></p>
<p><strong>Document view</strong> (see the left pane in the above screenshot) shows the <em>high-level structure of a PDF file</em>; selecting a node, its data is shown in the right pane through several views &#8212; in this case, selecting a <a href="http://clown.sourceforge.net/docs/api/org/pdfclown/documents/Page.html">page</a> node shows its <a href="http://clown.sourceforge.net/docs/api/org/pdfclown/documents/contents/objects/package-summary.html">content stream structure</a> (<strong>Contents view</strong>, see below) and its <a href="http://clown.sourceforge.net/docs/api/org/pdfclown/tools/Renderer.html">rendering</a> (<strong>Render view</strong> [¹], see above). Note that the page model represented by both Contents view and Render view corresponds to the content (sub)layer described in the diagram above.</p>
<p><a href="http://pdfclown.files.wordpress.com/2011/12/inspector_document_contents.png"><img class="aligncenter size-full wp-image-326" style="border:none;" title="PDF Clown DOM Inspector - Page contents and document structure" src="http://pdfclown.files.wordpress.com/2011/12/inspector_document_contents.png?w=700&#038;h=413" alt="" width="700" height="413" /></a></p>
<p>Here is just one of the possible functionalities: when you hover the mouse pointer over a show-text-operation node, a tooltip pops up revealing the actual text encoded inside it (in this example, inspecting a Russian-language document):</p>
<p><a href="http://pdfclown.files.wordpress.com/2011/12/domtexttooltip.png"><img class="aligncenter size-full wp-image-398" style="border:none;" title="Text tooltip showing the decoded content of the currently-hovered-node operation" src="http://pdfclown.files.wordpress.com/2011/12/domtexttooltip.png?w=700&#038;h=249" alt="" width="700" height="249" /></a></p>
<p>There&#8217;s such potential for custom features that <em>I&#8217;m considering making it pluggable so that it can be extended with additional modules, at the user&#8217;s will</em>.</p>
<h4>1.2. File view</h4>
<p><a href="http://pdfclown.files.wordpress.com/2011/12/inspector_file_render.png"><img class="aligncenter size-full wp-image-327" style="border:none;" title="PDF Clown DOM Inspector - File structure" src="http://pdfclown.files.wordpress.com/2011/12/inspector_file_render.png?w=700&#038;h=413" alt="" width="700" height="413" /></a></p>
<p><strong>File view</strong> shows the <em>low-level representation</em> of the same entities you found in the above-mentioned Document view, expressed as <a href="http://clown.sourceforge.net/docs/api/org/pdfclown/objects/package-summary.html">primitive objects</a> like dictionaries (PdfDictionary), arrays (PdfArray), streams (PdfStream) and so on.</p>
<h4>1.3. XRef view</h4>
<p><a href="http://pdfclown.files.wordpress.com/2011/12/inspector_xref_render.png"><img class="aligncenter size-full wp-image-328" style="border:none;" title="PDF Clown DOM Inspector - Cross-reference structure" src="http://pdfclown.files.wordpress.com/2011/12/inspector_xref_render.png?w=700&#038;h=413" alt="" width="700" height="413" /></a></p>
<p><strong>XRef view</strong> lists the entries of the <a href="http://clown.sourceforge.net/docs/api/org/pdfclown/files/IndirectObjects.html">cross-reference index</a> (either table or stream, but that&#8217;s a technical detail you can happily ignore as it&#8217;s transparently handled by the library).</p>
<p>It&#8217;s really interesting to note that <em>all the views (Document, File, XRef) are always kept synchronized</em>: when you select a node in one of these views, its corresponding entities in each of the others are automatically selected, allowing you to switch seamlessly from one view to another.</p>
<p>[¹] Rendering is still partial as it&#8217;s under development (pre-alpha stage).</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/324/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/324/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/324/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=324&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2011/12/09/waiting-for-pdf-clown-0-1-2-release/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/layersoverview.png" medium="image">
			<media:title type="html">PDF Clown layers</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/inspector_doc_render.png" medium="image">
			<media:title type="html">PDF Clown DOM Inspector - Renderer and document structure</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/inspector_document_contents.png" medium="image">
			<media:title type="html">PDF Clown DOM Inspector - Page contents and document structure</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/domtexttooltip.png" medium="image">
			<media:title type="html">Text tooltip showing the decoded content of the currently-hovered-node operation</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/inspector_file_render.png" medium="image">
			<media:title type="html">PDF Clown DOM Inspector - File structure</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/12/inspector_xref_render.png" medium="image">
			<media:title type="html">PDF Clown DOM Inspector - Cross-reference structure</media:title>
		</media:content>
	</item>
		<item>
		<title>PDF Clown 0.1.1 has been released!</title>
		<link>http://pdfclown.wordpress.com/2011/11/14/pdf-clown-0-1-1-has-been-released/</link>
		<comments>http://pdfclown.wordpress.com/2011/11/14/pdf-clown-0-1-1-has-been-released/#comments</comments>
		<pubDate>Mon, 14 Nov 2011 19:19:06 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Release]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=318</guid>
		<description><![CDATA[This release adds support to optional/layered contents, text highlighting, metadata streams (XMP), Type1/CFF font files, along with primitive object model and AcroForm fields filling enhancements. Lots of minor improvements have been applied too. Last but not least: ICSharpCode.SharpZipLib.dll dependency has been removed from .NET implementation. This release may be downloaded from: https://sourceforge.net/projects/clown/files/PDFClown-devel/0.1.1%20Beta/ enjoy!<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=318&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://pdfclown.wordpress.com/2011/04/12/waiting-for-pdf-clown-0-1-1-release/">This release</a> adds support for <strong>optional/layered contents</strong>, <strong>text highlighting</strong>, <strong>metadata streams</strong> (XMP), Type1/CFF font files, along with primitive object model and AcroForm fields filling enhancements. Lots of minor improvements have been applied too.</p>
<p>Last but not least: the ICSharpCode.SharpZipLib.dll dependency has been removed from the .NET implementation.</p>
<p>This release may be downloaded from:<br />
<a href="https://sourceforge.net/projects/clown/files/PDFClown-devel/0.1.1%20Beta/">https://sourceforge.net/projects/clown/files/PDFClown-devel/0.1.1%20Beta/</a></p>
<p>enjoy!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/318/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/318/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/318/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=318&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2011/11/14/pdf-clown-0-1-1-has-been-released/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>
	</item>
		<item>
		<title>Waiting for PDF Clown 0.1.1 release</title>
		<link>http://pdfclown.wordpress.com/2011/04/12/waiting-for-pdf-clown-0-1-1-release/</link>
		<comments>http://pdfclown.wordpress.com/2011/04/12/waiting-for-pdf-clown-0-1-1-release/#comments</comments>
		<pubDate>Tue, 12 Apr 2011 17:53:46 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Development]]></category>
		<category><![CDATA[DOM]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[text highlighting]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=229</guid>
		<description><![CDATA[[NOTE: this post was updated on November 14, 2011] Latest news: on November 14, 2011 PDF Clown 0.1.1 has been released! Next release is going to introduce new exciting features (text highlighting, optional/layered contents, Type1/CFF font support, etc.) along with improvements and consolidations of existing ones (enhanced text extraction, enhanced content rendering, enhanced acroform creation [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=229&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:right;"><span style="color:#999999;">[NOTE: this post was updated on November 14, 2011]</span></p>
<p style="text-align:center;"><span style="color:#99cc00;">Latest news: on November 14, 2011 <a href="http://pdfclown.wordpress.com/2011/11/14/pdf-clown-0-1-1-has-been-released/"><strong>PDF Clown 0.1.1</strong> has been released!</a></span></p>
<p><a href="http://www.stefanochizzolini.it/en/projects/clown/devel/roadmap.html#0.1.1">Next release</a> is going to introduce new exciting features (text highlighting, optional/layered contents, Type1/CFF font support, etc.) along with improvements and consolidations of existing ones (enhanced text extraction, enhanced content rendering, enhanced acroform creation and filling, etc.). This post will be kept updated according to development progress, so please stay tuned! <img src='http://s1.wp.com/wp-includes/images/smilies/icon_wink.gif' alt=';-)' class='wp-smiley' /> </p>
<div style="background-color:#98fb98;-moz-border-radius:10px;border-radius:10px;border:3px solid #68CB68;margin:15px;padding:10px;">NOTE &#8212; As 0.1.1 version is currently under development, <em>the new features described below are available through the trunk (HEAD revision) of <a href="http://sourceforge.net/scm/?type=svn&amp;group_id=176158"><strong>PDF Clown&#8217;s SVN repository</strong></a>.</em></div>
<p>These are some of the things I have been working on till now:</p>
<ul>
<li>primitive object model enhancements</li>
<li>text highlighting</li>
<li>metadata streams (XMP)</li>
<li>optional/layered contents</li>
<li>AcroForm fields filling</li>
</ul>
<h3>1. Primitive object model enhancements</h3>
<p>PDF primitive object model (see <a href="http://clown.sourceforge.net/0.1/docs/api/org/pdfclown/objects/package-summary.html">org.pdfclown.objects namespace</a>) has undergone a <em>substantial revision in order to simplify its use</em> (transparent update), <em>extend its functionality</em> (bidirectional traversal), <em>enforce its consistency</em> (simple object immutability) and <em>consolidate its code base</em> (parser classes refactoring).</p>
<p><strong>Bidirectional traversal</strong> has been accomplished by the introduction of explicit references to ascendants: composite objects (PdfDictionary, PdfArray, PdfStream) are now aware of their parent container, so walking through the ascending path to the root PdfIndirectObject (and File) is absolutely trivial! This functionality has loads of engaging potential applications, such as fine-grained object cloning based on structure context (as in case of Acroform annotations residing on a given page).</p>
<p>Ascendant-aware objects are intelligent enough to automatically detect and notify changes to their parent container, making <strong>incremental updates transparent to the user</strong>.</p>
<p><strong>Simple objects have been made immutable</strong> to avoid risks of unintended changes and promote their efficient reuse.</p>
<p>As expected (you may have noticed some TODO task comments about this within the project&#8217;s code base), <strong>object parsing of PostScript-related formats</strong> (PDF file, PDF content stream and CMaps) has been organized under the same class hierarchy to improve its consistency and maintainability.</p>
<h3>2. Text highlighting</h3>
<p><strong>Text highlighting</strong> was a <a href="http://pdfclown.wordpress.com/2010/01/02/upcoming-0-0-8-whats-going-to-be-new/#comment-19">much-requested feature</a>. It took me less than one hour of enjoyable coding to write a prototype which could <em>populate a PDF file with highlight annotations matching an arbitrary text pattern</em>, as you can see in the following figure representing a page of Alice in Wonderland resulting from the search of &#8220;rabbit&#8221; occurrences:</p>
<p><a href="http://pdfclown.files.wordpress.com/2011/04/texthighlight.jpg"><img class="aligncenter size-full wp-image-231" style="border:none;" title="An example of text highlighting generated by PDF Clown" src="http://pdfclown.files.wordpress.com/2011/04/texthighlight.jpg?w=700&#038;h=751" alt="" width="700" height="751" /></a></p>
<p>This text highlighting sample leverages both text extraction [line 55] and annotation [line 106] functionalities of PDF Clown, as you can see in its source code:</p>
<p><pre class="brush: java; highlight: [55,106];">
package org.pdfclown.samples.cli;

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.File;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;

/**
  This sample demonstrates how to highlight text matching arbitrary patterns.
  Highlighting is defined through text markup annotations.

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @since 0.1.1
  @version 0.1.1
*/
public class TextHighlightSample
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);}

    // Define the text pattern to look for!
    String textRegEx = promptChoice(&quot;Please enter the pattern to look for: &quot;);
    Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);

    // 2. Iterating through the document pages...
    TextExtractor textExtractor = new TextExtractor(true, true);
    for(final Page page : file.getDocument().getPages())
    {
      System.out.println(&quot;\nScanning page &quot; + (page.getIndex()+1) + &quot;...\n&quot;);

      // 2.1. Extract the page text!
      Map&lt;Rectangle2D,List&lt;ITextString&gt;&gt; textStrings = textExtractor.extract(page);

      // 2.2. Find the text pattern matches!
      final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));

      // 2.3. Highlight the text pattern matches!
      textExtractor.filter(
        textStrings,
        new TextExtractor.IIntervalFilter()
        {
          @Override
          public boolean hasNext()
          {return matcher.find();}

          @Override
          public Interval next()
          {return new Interval(matcher.start(), matcher.end());}

          @Override
          public void process(
            Interval interval,
            ITextString match
            )
          {
            // Defining the highlight box of the text pattern match...
            List&lt;Quad&gt; highlightQuads = new ArrayList&lt;Quad&gt;();
            {
              /*
                NOTE: A text pattern match may be split across multiple contiguous lines,
                so we have to define a distinct highlight box for each text chunk.
              */
              Rectangle2D textBox = null;
              for(TextChar textChar : match.getTextChars())
              {
                Rectangle2D textCharBox = textChar.getBox();
                if(textBox == null)
                {textBox = (Rectangle2D)textCharBox.clone();}
                else
                {
                  if(textCharBox.getY() &gt; textBox.getMaxY())
                  {
                    highlightQuads.add(Quad.get(textBox));
                    textBox = (Rectangle2D)textCharBox.clone();
                  }
                  else
                  {textBox.add(textCharBox);}
                }
              }
              highlightQuads.add(Quad.get(textBox));
            }
            // Highlight the text pattern match!
            new TextMarkup(page, MarkupTypeEnum.Highlight, highlightQuads);
          }

          @Override
          public void remove()
          {throw new UnsupportedOperationException();}
        }
        );
    }

    // 3. Highlighted file serialization.
    serialize(file, false);

    return true;
  }
}
</pre></p>
<p>This is another example matching words which contain &#8220;co&#8221; (regular expression &#8220;\w*co\w*&#8221;):<br />
<a href="http://pdfclown.files.wordpress.com/2011/04/texthighlight2.jpg"><img class="aligncenter size-full wp-image-252" style="border:none;" title="Text highlighting containment matches" src="http://pdfclown.files.wordpress.com/2011/04/texthighlight2.jpg?w=700&#038;h=837" alt="" width="700" height="837" /></a><br />
Here you can appreciate the dehyphenation functionality applied to another search (words beginning with &#8220;devel&#8221; &#8212; regular expression &#8220;\bdevel\w*&#8221;):</p>
<p><a href="http://pdfclown.files.wordpress.com/2011/04/texthighlight3.jpg"><img class="aligncenter size-full wp-image-253" style="border:none;" title="Text highlighting on dehyphenated matches" src="http://pdfclown.files.wordpress.com/2011/04/texthighlight3.jpg?w=700&#038;h=754" alt="" width="700" height="754" /></a></p>
<h3>3. Metadata streams (XMP)</h3>
<p><a href="http://en.wikipedia.org/wiki/Extensible_Metadata_Platform">XMP</a> <strong>metadata streams</strong> are now available for reading and writing on any dictionary or stream entity within a PDF document (see PdfObjectWrapper.get/setMetadata()).</p>
<h3>4. Optional/Layered contents</h3>
<p>Smoothing out some PDF spec awkwardness while implementing the <strong>content layer (aka optional content) functionality</strong> proved to be an interesting challenge. The result was nothing but satisfaction: <em>a clean, intuitive and rich programming interface which automates lots of annoying housekeeping tasks</em> and lets you access even the whole raw structures in case of special needs! <img src='http://s0.wp.com/wp-includes/images/smilies/icon_cool.gif' alt='8-)' class='wp-smiley' /> </p>
<p><a href="http://pdfclown.files.wordpress.com/2011/04/layer.jpg"><img class="aligncenter size-full wp-image-273" style="border:none;" title="Content layering sample" src="http://pdfclown.files.wordpress.com/2011/04/layer.jpg?w=700&#038;h=742" alt="" width="700" height="742" /></a></p>
<p>The figure above represents a document generated by the following code sample; for the sake of comparison, <a href="http://itextpdf.com/examples/iia.php?id=265">I took an iText example</a> and translated it to PDF Clown, adding some niceties like the cooperation between the PrimitiveComposer (whose lower-level role is graphics composition through primitive operations like showing text lines and drawing shapes) and the BlockComposer (whose higher-level role is to arrange text within page areas managing alignments, paragraph spacing and indentation, hyphenation, and so on).</p>
<p><pre class="brush: java; highlight: [65,66,74,75,79,86,88,101,102,145,146,147,148,149];">
package org.pdfclown.samples.cli;

import java.awt.Dimension;
import java.awt.Point;
import java.awt.Rectangle;

import org.pdfclown.documents.Document;
import org.pdfclown.documents.Document.PageModeEnum;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.composition.AlignmentXEnum;
import org.pdfclown.documents.contents.composition.AlignmentYEnum;
import org.pdfclown.documents.contents.composition.BlockComposer;
import org.pdfclown.documents.contents.composition.PrimitiveComposer;
import org.pdfclown.documents.contents.fonts.StandardType1Font;
import org.pdfclown.documents.contents.layers.Layer;
import org.pdfclown.documents.contents.layers.Layer.ViewStateEnum;
import org.pdfclown.documents.contents.layers.LayerDefinition;
import org.pdfclown.documents.contents.layers.LayerGroup;
import org.pdfclown.documents.contents.layers.Layers;
import org.pdfclown.files.File;

/**
  This sample demonstrates how to define layers (aka optional contents) to control content visibility, covering nested layers, a simple group and a radio group.

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @since 0.1.1
  @version 0.1.1
*/
public class LayerCreationSample
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    // 1. PDF file instantiation (no source path is passed to the File constructor).
    File file = new File();
    Document document = file.getDocument();

    // 2. Content creation (see populate(...) below).
    populate(document);

    // 3. Serialize the PDF file!
    serialize(file, false, &quot;Layer&quot;, &quot;inserting layers&quot;);

    return true;
  }

  private void populate(
    Document document
    )
  {
    // Initialize a new page and add it to the document pages!
    Page page = new Page(document);
    document.getPages().add(page);

    // Initialize the primitive composer (within the new page context)!
    PrimitiveComposer composer = new PrimitiveComposer(page);
    composer.setFont(new StandardType1Font(document, StandardType1Font.FamilyEnum.Helvetica, true, false), 12); // NOTE(review): the two boolean flags presumably select bold/italic -- confirm against StandardType1Font.

    // Initialize the block composer (wrapping the primitive one)!
    BlockComposer blockComposer = new BlockComposer(composer);

    // Initialize the document layer configuration!
    LayerDefinition layerDefinition = new LayerDefinition(document); // Creates the document layer configuration.
    document.setLayer(layerDefinition); // Activates the document layer configuration.
    document.setPageMode(PageModeEnum.Layers); // Shows the layers tab on document opening.

    // Get the root layers collection (every layer and group created below is added either here or to a descendant collection)!
    Layers rootLayers = layerDefinition.getLayers();

    // 1. Nested layers (one parent layer added to the root collection, plus two sublayers added to the parent's own collection).
    {
      Layer nestedLayer = new Layer(document, &quot;Nested layer&quot;);
      rootLayers.add(nestedLayer);
      Layers nestedSubLayers = nestedLayer.getLayers();

      Layer nestedLayer1 = new Layer(document, &quot;Nested layer 1&quot;);
      nestedSubLayers.add(nestedLayer1);

      Layer nestedLayer2 = new Layer(document, &quot;Nested layer 2&quot;);
      nestedSubLayers.add(nestedLayer2);
      nestedLayer2.setLocked(true); // NOTE(review): presumably prevents users from toggling this layer in the viewer -- confirm.

      // NOTE: Text in this section is shown using PrimitiveComposer only; each text is enclosed within its own beginLayer(...)/end() pair.
      composer.beginLayer(nestedLayer); // Opens the content section associated with this layer.
      composer.showText(nestedLayer.getTitle(), new Point(50, 50));
      composer.end(); // Closes the layer section opened above.

      composer.beginLayer(nestedLayer1);
      composer.showText(nestedLayer1.getTitle(), new Point(50, 75));
      composer.end();

      composer.beginLayer(nestedLayer2);
      composer.showText(nestedLayer2.getTitle(), new Point(50, 100));
      composer.end();
    }

    // 2. Simple group (labeled group of non-nested, inclusive-state layers).
    {
      Layers simpleGroup = new Layers(document, &quot;Simple group&quot;);
      rootLayers.add(simpleGroup);

      Layer layer1 = new Layer(document, &quot;Grouped layer 1&quot;);
      simpleGroup.add(layer1);

      Layer layer2 = new Layer(document, &quot;Grouped layer 2&quot;);
      simpleGroup.add(layer2);

      // NOTE: Text in this section is shown using BlockComposer along with PrimitiveComposer
      // to demonstrate their flexible cooperation (layer sections are opened/closed through the primitive composer, text is shown through the block composer).
      blockComposer.begin(new Rectangle(50, 125, 200, 50), AlignmentXEnum.Left, AlignmentYEnum.Middle); // Text area: x=50, y=125, width=200, height=50.

      composer.beginLayer(layer1);
      blockComposer.showText(layer1.getTitle());
      composer.end();

      blockComposer.showBreak(new Dimension(0, 15)); // NOTE(review): presumably a 15-unit vertical break between the two texts -- confirm.

      composer.beginLayer(layer2);
      blockComposer.showText(layer2.getTitle());
      composer.end();

      blockComposer.end();
    }

    // 3. Radio group (labeled group of non-nested, exclusive-state layers).
    {
      Layers radioGroup = new Layers(document, &quot;Radio group&quot;);
      rootLayers.add(radioGroup);

      Layer radio1 = new Layer(document, &quot;Radiogrouped layer 1&quot;);
      radioGroup.add(radio1);
      radio1.setViewState(ViewStateEnum.On); // Only this option starts in the On state (the other two are set to Off below).

      Layer radio2 = new Layer(document, &quot;Radiogrouped layer 2&quot;);
      radioGroup.add(radio2);
      radio2.setViewState(ViewStateEnum.Off);

      Layer radio3 = new Layer(document, &quot;Radiogrouped layer 3&quot;);
      radioGroup.add(radio3);
      radio3.setViewState(ViewStateEnum.Off);

      // Register this option group (made of the three layers above) in the layer configuration!
      LayerGroup options = new LayerGroup(document);
      options.add(radio1);
      options.add(radio2);
      options.add(radio3);
      layerDefinition.getOptionGroups().add(options);

      // NOTE: Text in this section is shown using BlockComposer along with PrimitiveComposer
      // to demonstrate their flexible cooperation (same pattern as in the simple group section).
      blockComposer.begin(new Rectangle(50, 185, 200, 75), AlignmentXEnum.Left, AlignmentYEnum.Middle); // Text area: x=50, y=185, width=200, height=75.

      composer.beginLayer(radio1);
      blockComposer.showText(radio1.getTitle());
      composer.end();

      blockComposer.showBreak(new Dimension(0, 15));

      composer.beginLayer(radio2);
      blockComposer.showText(radio2.getTitle());
      composer.end();

      blockComposer.showBreak(new Dimension(0, 15));

      composer.beginLayer(radio3);
      blockComposer.showText(radio3.getTitle());
      composer.end();

      blockComposer.end();
    }
    composer.flush(); // NOTE(review): presumably commits the composed contents to the page -- confirm.
  }
}
</pre></p>
<p>Some comments on the code:</p>
<ul>
<li>document layer configuration initialization [lines 65-66]: this is the first operation to do;</li>
<li>layer creation [line 74] and insertion [line 75] into the hierarchical structure;</li>
<li>sublayer insertion [line 79];</li>
<li>content layering [lines 86, 88]: content is enclosed within a layer section, making its visibility dependent on the layer state. <em>There&#8217;s a subtle discrepancy in the PDF spec when it comes to nested layers</em>: one may assume they imply a hierarchical dependency of the sublayer states, but that&#8217;s NOT the case &#8212; if you hide a layer its descendants are still visible! To work around this counterintuitive behaviour, many software toolkits wrap contents within multiple nested layer blocks; for example, if you want to wrap the text &#8220;nested layer 1&#8221; into a layer (resource name /Pr2) which is a sublayer of another one (resource name /Pr1), the content stream will contain this cumbersome syntax:<br />
<code><br />
4 0 obj<br />
&lt;&lt; /Length 205 &gt;&gt;<br />
stream<br />
[...]<br />
<span style="background-color:yellow;"> /OC /Pr1 BDC<br />
/OC /Pr2 BDC</span><br />
q<br />
BT<br />
1 0 0 1 100 800 Tm<br />
/F1 12 Tf<br />
(nested layer 1)Tj<br />
ET<br />
Q<br />
<span style="background-color:yellow;">EMC<br />
EMC</span><br />
[...]<br />
endstream<br />
endobj<br />
</code><br />
This beast is repeated as many times as there are distinct content chunks to include within the same layer; it gets even worse as the number of nesting levels increases &#8212; just awful! <img src='http://s1.wp.com/wp-includes/images/smilies/icon_eek.gif' alt='8-O' class='wp-smiley' /> Instead of this, PDF Clown defines a default hierarchical membership for each layer which can be used as a single, terse wrapping block (resource name /Pr2):<br />
<code><br />
4 0 obj<br />
&lt;&lt; /Length 185 &gt;&gt;<br />
stream<br />
[...]<br />
<span style="background-color:yellow;">/OC /Pr2 BDC</span><br />
q<br />
BT<br />
1 0 0 1 100 800 Tm<br />
/F1 12 Tf<br />
(nested layer 1)Tj<br />
ET<br />
Q<br />
<span style="background-color:yellow;">EMC</span><br />
[...]<br />
endstream<br />
endobj<br />
&nbsp;<br />
6 0 obj<br />
&lt;&lt; /Type /Pages /Count 1 /Resources &lt;&lt; /Font 7 0 R /Properties 15 0 R &gt;&gt; /Kids [5 0 R ] &gt;&gt;<br />
endobj<br />
&nbsp;<br />
15 0 obj<br />
&lt;&lt; /Pr2 16 0 R &gt;&gt;<br />
endobj<br />
&nbsp;<br />
16 0 obj<br />
&lt;&lt; /Type /OCMD /OCGs [12 0 R 11 0 R ] /P /AllOn &gt;&gt; <span style="color:green;">% Membership containing the references to the layers belonging to the hierarchical path of nested layer 1.</span><br />
endobj<br />
</code><br />
This way code is concise and more maintainable (if you want to rearrange the hierarchical structure of the layers you don&#8217;t have to walk through the content stream hunting layer block occurrences for correction &#8212; just go to the membership associated to the layer and update its hierarchical path!). <img src='http://s0.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </li>
<li>simple layer group creation and insertion [lines 101-102]</li>
<li>option group definition [lines 145-149]</li>
</ul>
<h3>5. AcroForm fields filling</h3>
<p>Text fields have been enhanced to support <em>automatic appearance update on value change</em>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/229/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/229/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/229/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=229&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2011/04/12/waiting-for-pdf-clown-0-1-1-release/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/04/texthighlight.jpg" medium="image">
			<media:title type="html">An example of text highlighting generated by PDF Clown</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/04/texthighlight2.jpg" medium="image">
			<media:title type="html">Text highlighting containment matches</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/04/texthighlight3.jpg" medium="image">
			<media:title type="html">Text highlighting on dehyphenated matches</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/04/layer.jpg" medium="image">
			<media:title type="html">Content layering sample</media:title>
		</media:content>
	</item>
		<item>
		<title>Celebrating Freedom</title>
		<link>http://pdfclown.wordpress.com/2011/03/07/celebrating-freedom/</link>
		<comments>http://pdfclown.wordpress.com/2011/03/07/celebrating-freedom/#comments</comments>
		<pubDate>Mon, 07 Mar 2011 23:43:31 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Musings]]></category>
		<category><![CDATA[Egypt]]></category>
		<category><![CDATA[Freedom]]></category>
		<category><![CDATA[Italy]]></category>
		<category><![CDATA[Libya]]></category>
		<category><![CDATA[Tunisia]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=192</guid>
		<description><![CDATA[This is a neat off-topic but, besides being a human and a european citizen, I&#8217;m an Italian: as in the next week (precisely on March 17, 2011) Italy will celebrate its 150th anniversary of unification, I&#8217;d like for a moment to speak out to all you guys coming to this blog from &#8217;round the world. [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=192&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><img class="alignleft size-full wp-image-193" style="border:none;" title="Coccarda Unità d'Italia" src="http://pdfclown.files.wordpress.com/2011/03/coccardaitalia.jpg?w=700" alt=""   />This is a neat off-topic but, besides being a human and a european citizen, I&#8217;m an Italian: as in the next week (precisely on March 17, 2011) <a href="http://www.italiaunita150.it/programma-delle-celebrazioni.aspx"><strong>Italy</strong> will celebrate its 150th anniversary</a> of <a href="http://en.wikipedia.org/wiki/Italian_unification">unification</a>, I&#8217;d like for a moment to speak out to all you guys coming to this blog from &#8217;round the world.</p>
<p>Italy had a great past made by giants; unfortunately, it currently has a troubled present made by <a href="http://www.nytimes.com/2011/02/22/opinion/22iht-edcohen22.html">embarrassing dwarves</a>: <em>despite this sad situation, please do NOT believe the traditional stereotypes which infamously depict <a href="http://en.wikipedia.org/wiki/Italian_people">Italians</a></em> &#8212; Italy is a complex country populated both by lovely persons and pure bastards, like any other country. <img src='http://s0.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /><br />
<br style="clear:left;" /><br />
<span style="text-align:center; display: block;"><a href="http://pdfclown.wordpress.com/2011/03/07/celebrating-freedom/"><img src="http://img.youtube.com/vi/jrcA6j-GRE8/2.jpg" alt="" /></a></span></p>
<p>In this particular historical moment my thought goes, full of admiration, to the <strong>brave people of Northern Africa</strong> and all the other Arabs and Persians who are fighting for their freedom: as a European, I apologize for the weak political and humanitarian response of the Union&#8217;s institutions to their efforts for emancipating themselves from dictatorships. History is moving!</p>
<ul>
<li><a href="http://iamjan25.com/">I am Jan 25 &#8211; Videos and Pictures from the Egyptian Revolution</a></li>
</ul>
<p style="text-align:center;"><img class="aligncenter  wp-image-194" style="border:none;" title="Tunisia" src="http://pdfclown.files.wordpress.com/2011/03/tunisiaprotest.jpg?w=640&#038;h=350" alt="" width="640" height="350" /><img class="aligncenter  wp-image-195" style="border:none;" title="Egypt" src="http://pdfclown.files.wordpress.com/2011/03/egyptprotest.jpg?w=640&#038;h=310" alt="" width="640" height="310" /><img class="aligncenter  wp-image-196" style="border:none;" title="Libya" src="http://pdfclown.files.wordpress.com/2011/03/libyaprotest.jpg?w=640&#038;h=350" alt="" width="640" height="350" /></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/192/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/192/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/192/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=192&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2011/03/07/celebrating-freedom/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/03/coccardaitalia.jpg" medium="image">
			<media:title type="html">Coccarda Unità d&#039;Italia</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/03/tunisiaprotest.jpg" medium="image">
			<media:title type="html">Tunisia</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/03/egyptprotest.jpg" medium="image">
			<media:title type="html">Egypt</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2011/03/libyaprotest.jpg" medium="image">
			<media:title type="html">Libya</media:title>
		</media:content>
	</item>
		<item>
		<title>PDF Clown 0.1.0 has been released!</title>
		<link>http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/</link>
		<comments>http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/#comments</comments>
		<pubDate>Fri, 04 Mar 2011 00:50:07 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Release]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=183</guid>
		<description><![CDATA[Latest news: PDF Clown 0.1.0 has been superseded by PDF Clown 0.1.1 This release introduces support to cross-reference-stream-based PDF files (as defined since PDF 1.5 spec) along with page rendering and printing: a specialized tool provides a convenient way to convert PDF pages into images (aka rasterization). Lots of minor improvements have been applied too. [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=183&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:center;"><span style="color:#99cc00;">Latest news: PDF Clown 0.1.0 has been superseded by <a href="http://pdfclown.wordpress.com/2011/11/14/pdf-clown-0-1-1-has-been-released/"><strong>PDF Clown 0.1.1</strong></a></span></p>
<p>This release introduces support to <strong><a href="http://pdfclown.wordpress.com/2010/09/23/waiting-for-pdf-clown-0-1-release/">cross-reference-stream-based PDF files</a></strong> (as defined since PDF 1.5 spec) <a href="http://pdfclown.wordpress.com/2010/09/23/waiting-for-pdf-clown-0-1-release/">along with <strong>page rendering and printing</strong></a>: a specialized tool provides a convenient way to convert PDF pages into images (aka rasterization). Lots of minor improvements have been applied too.</p>
<p>Last but not least: the project&#8217;s base namespace has changed to <strong>org.pdfclown</strong></p>
<p>This release may be downloaded from:<br />
<a href="https://sourceforge.net/projects/clown/files/PDFClown-devel/0.1.0%20Alpha/">https://sourceforge.net/projects/clown/files/PDFClown-devel/0.1.0%20Alpha/</a></p>
<p>enjoy!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/183/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/183/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/183/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=183&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/feed/</wfw:commentRss>
		<slash:comments>20</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>
	</item>
		<item>
		<title>Waiting for PDF Clown 0.1.0 release</title>
		<link>http://pdfclown.wordpress.com/2010/09/23/waiting-for-pdf-clown-0-1-release/</link>
		<comments>http://pdfclown.wordpress.com/2010/09/23/waiting-for-pdf-clown-0-1-release/#comments</comments>
		<pubDate>Thu, 23 Sep 2010 00:23:45 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Development]]></category>
		<category><![CDATA[compatibility check]]></category>
		<category><![CDATA[cross-reference stream]]></category>
		<category><![CDATA[object stream]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[printing]]></category>
		<category><![CDATA[rasterization]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=117</guid>
		<description><![CDATA[[NOTE: this post was updated on March 4, 2011] Latest news: on March 4, 2011 PDF Clown 0.1.0 has been released! Hi there! New features currently under development that will be available in the next (0.1.0) release: cross-reference streams and object streams version compatibility check content rasterization functions page data size (a.k.a. How to split [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=117&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:right;"><span style="color:#808080;">[NOTE: this post was updated on March 4, 2011]</span></p>
<p style="text-align:center;"><span style="color:#99cc00;">Latest news: on March 4, 2011 <a href="http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/"><strong>PDF Clown 0.1.0</strong> has been released!</a></span></p>
<p>Hi there!</p>
<p>New features currently under development that will be available in the next (0.1.0) release:</p>
<ul>
<li>cross-reference streams and object streams</li>
<li>version compatibility check</li>
<li>content rasterization</li>
<li>functions</li>
<li>page data size (a.k.a. How to split a PDF document based on maximum file size)</li>
</ul>
<p>It&#8217;s time to reveal to you that I decided to consolidate the project&#8217;s identity (and simplify <a href="http://www.softdistrict.com/wp-content/uploads/2009/12/typing-software2.jpg">your typing life</a>) by <em>changing its namespace prefix (it.stefanochizzolini.clown) in favor of the more succinct <strong>org.pdfclown</strong></em>: I know you were eager to strip that cluttering Italian identifier! <img src='http://s1.wp.com/wp-includes/images/smilies/icon_wink.gif' alt=';-)' class='wp-smiley' /> </p>
<p>Last week I was informed that <strong><a href="http://www.usgs.gov/">USGS</a></strong> adopted PDF Clown for <em>relayering their topographic maps and attaching metadata to them</em>. Although <a href="http://www.agc.army.mil/GeoPDF/pdf/US%20geospatial%20PDF%20Working%20Group%20DC%20HLestinsky.pdf">on a technical note</a> it&#8217;s stated that its use will be only transitory, as they are converging toward a solution natively integrated with their main application suite (TerraGo), nonetheless its service in such a production environment seems to be an eloquent demonstration of its reliability. 8)</p>
<h3>1. Cross-reference streams and object streams</h3>
<p>After <a href="http://sourceforge.net/projects/clown/forums/forum/607163/topic/3434621">lots of requests</a>, I&#8217;m currently busy with the development of <strong>cross-reference stream and object stream</strong> read/write functionalities [PDF:1.6:3.4.6-7]; in particular, stream reading has been <em>partially based upon the code that <a href="http://en.wikipedia.org/wiki/Joshua_Tauberer">Joshua Tauberer</a> wrote some months ago</em> while he was experimenting with PDF Clown on PDF files analysis for his US Congress activity tracker, <a href="http://www.govtrack.us/">GovTrack</a>.</p>
<h3>2. Version compatibility check</h3>
<p>Working on cross-reference streams induced me to start supporting <strong>version-compatibility checking</strong> via annotations. This feature conveniently allows users to <em>transparently control that the PDF files they are creating or modifying conform to a target PDF version</em> (as specified in PDF file header) according to a configurable compatibility policy, defined through Document.Configuration.CompatibilityModeEnum &#8212; these are the alternative policies applicable:</p>
<ul>
<li>Passthrough: document&#8217;s conformance version is <em>ignored</em>; any feature is accepted without checking its compatibility.</li>
<li>Loose: document&#8217;s conformance version is <em>automatically updated</em> to support actually used features.</li>
<li>Strict: document&#8217;s conformance version is <em>mandatory</em>; any unsupported feature is forbidden and causes an exception to be thrown in case of attempted use.</li>
</ul>
<p><em>Automatic compatibility checking is very handy as users can enforce generated PDF files&#8217; conformance without manual intervention</em>; for example, you don&#8217;t have to tweak your PDF file version to 1.5 if you plan to use the optional content functionality (OCG [PDF:1.6:4.10]), just sit back and see it be done! <img src='http://s0.wp.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p>
<h3>3. Content rasterization</h3>
<p>I&#8217;m quite impressed how naturally the existing model is integrating with <strong>PDF printing and image rasterization</strong> functionalities. <em>Leveraging the existing model means that there&#8217;s a common infrastructure</em> (see <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/ContentScanner.html">ContentScanner</a> and <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/ContentObject.html">ContentObject</a> hierarchy) <em>that serves disparate purposes</em> (content creation, content analysis, content extraction, content rasterization, content editing, and so on), simplifying its understanding, use, maintenance and extension. I wanna stress that my goal is to come to an elegant viewer, NOT to a cumbersome retrofit component that&#8217;s added as an alien to fill the gap! <img src='http://s1.wp.com/wp-includes/images/smilies/icon_wink.gif' alt=';-)' class='wp-smiley' /> </p>
<p>Yes, I know <a href="http://www.stefanochizzolini.it/en/projects/clown/features.html">these goodies had been outside my official plans for a long time</a>, but during the last week of September, while crawling through the PDF Clown sources, I stumbled upon the above-mentioned <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/ContentScanner.html">ContentScanner</a> and <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/ContentObject.html">ContentObject</a> hierarchy: I realized they were just ready for supporting content rendering, so I thought &#8220;What are we waiting for? Let&#8217;s do it!&#8221;&#8230; but don&#8217;t expect that 0.1 will deliver a full-fledged PDF viewer and printer &#8212; I&#8217;ll start prototyping the most basic graphics primitives such as space coordinates transformations, path drawing, color selection and so on. Advanced operations such as glyph outline drawing will necessarily appear afterwards. Anyway, I&#8217;m confident that at the end of the development process it will be possible to print and display PDF pages (and even independent parts of them such as <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/xObjects/FormXObject.html">external forms</a>) along with their thumbnails.</p>
<p>The figure below compares an example of PDF Clown&#8217;s current rasterization capabilities (on the left, via Java 2D graphics) with its equivalent generated by Adobe Reader (on the right). As you can see, path drawing is highly conformant with the reference implementation, while no text rendering has been implemented yet.</p>
<p><a href="http://pdfclown.files.wordpress.com/2010/09/render1.jpg"><img class="aligncenter size-full wp-image-152" style="border:none;" title="Rasterization comparison" src="http://pdfclown.files.wordpress.com/2010/09/render1.jpg?w=700&#038;h=423" alt="" width="700" height="423" /></a></p>
<p>Creating this figure was absolutely trivial &#8212; here is the code sample used (line 34 executes the actual rendering of the first page of the document):</p>
<p><pre class="brush: java; highlight: [34];">
package org.pdfclown.samples;

import java.awt.Dimension;
import java.awt.image.BufferedImage;
import java.io.IOException;

import javax.imageio.ImageIO;

import org.pdfclown.documents.Document;
import org.pdfclown.files.File;
import org.pdfclown.tools.Renderer;

public class ContentRenderingSample
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);}

    // 2. Get the PDF document!
    Document document = file.getDocument();

    // 3. Rasterize the first page!
    Renderer renderer = new Renderer();
    BufferedImage image = renderer.render(document.getPages().get(0),new Dimension(1400,850));

    // 4. Save the rasterized image!
    try
    {ImageIO.write(image,&quot;jpg&quot;,new java.io.File(getOutputPath() + java.io.File.separator + &quot;ContentRenderingSample.jpg&quot;));}
    catch(IOException e)
    {e.printStackTrace();}

    return true;
  }
}
</pre></p>
<p>As you can see in the following code chunk, the Renderer.render(&#8230;) method takes care of preparing the target graphics context [line 31], delegating its rendering to the chosen <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/IContentContext.html">content context</a> [line 32] (that is, in this case, a <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/Page.html">Page</a> object):</p>
<p><pre class="brush: java; highlight: [31,32];">
package org.pdfclown.tools;

import java.awt.Dimension;
import java.awt.image.BufferedImage;

import org.pdfclown.documents.contents.IContentContext;

/**
  Tool for rendering {@link IContentContext content contexts}.

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @version 0.1.0
  @since 0.1.0
*/
public class Renderer
{
    . . .

    /**
      Renders the specified content context into an image context.

      @param contentContext Source content context.
      @param size Image size expressed in device-space units (that is typically pixels).
      @return Image representing the rendered contents.
    */
    public BufferedImage render(
      IContentContext contentContext,
      Dimension size
      )
    {
      BufferedImage image = new BufferedImage(size.width,size.height,BufferedImage.TYPE_INT_BGR);
      contentContext.render(image.createGraphics(),size);

      return image;
    }
}
</pre></p>
<p>The Page object then delegates its own contents rendering to <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/ContentScanner.html">ContentScanner</a> [line 36], which sequentially scans every graphics operation producing its corresponding raster representation:</p>
<p><pre class="brush: java; highlight: [36];">
package org.pdfclown.documents;

import java.awt.Graphics2D;
import java.awt.geom.Dimension2D;
import java.awt.print.Printable;

import org.pdfclown.PDF;
import org.pdfclown.VersionEnum;
import org.pdfclown.documents.contents.ContentScanner;
import org.pdfclown.documents.contents.IContentContext;
import org.pdfclown.objects.PdfDictionary;
import org.pdfclown.objects.PdfObjectWrapper;

/**
  [PDF:1.6:3.6.2] Document page.

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @since 0.0.0
  @version 0.1.0
*/
@PDF(VersionEnum.PDF10)
public class Page
  extends PdfObjectWrapper
  implements IContentContext,
    Printable
{
  . . .

  @Override
  public void render(
    Graphics2D context,
    Dimension2D size
    )
  {
    ContentScanner scanner = new ContentScanner(getContents());
    scanner.render(context,size);
  }

  . . .
}
</pre></p>
<h3>4. Functions</h3>
<p>Improving the color space definitions for content rasterization is forcing me to also manage <strong>functions</strong> [PDF:1.6:3.9] in all their flavors (Type 0 (Sampled), Type 2 (Exponential Interpolation), Type 3 (Stitching) and Type 4 (PostScript Calculator)).</p>
<h3>5. Page data size (a.k.a. How to split a PDF document based on maximum file size)</h3>
<p><a href="http://clown.sourceforge.net/0.1/docs/api/org/pdfclown/tools/PageManager.html">org.pdfclown.tools.PageManager</a> has been enhanced with the introduction of <strong>an elegant algorithm that accurately calculates the data size of PDF pages</strong> taking shared resources (like fonts, images and so on) into consideration: this practically means that you can evaluate the incremental size of each page in a document, splitting the file when the collected pages reach the maximum file size you intended for your target split PDF files, <em>without creating any cumbersome temporary file!</em></p>
<p>It&#8217;s really funny that proprietary products like PDFTron&#8217;s, which cost at least hundreds of bucks, <a href="http://www.pdftron.com/kb/questions.php?questionid=237">suggest in their official SDKs awkward trial-and-error strategies, iteratively creating horrible temporary files</a>&#8230; evidently, only clowns can solve such a task better <img src='http://s2.wp.com/wp-includes/images/smilies/icon_razz.gif' alt=':-P' class='wp-smiley' />  &#8212; <a href="http://www.fsf.org/">Free Software</a> rocks! *&lt;;o)</p>
<p><pre class="brush: java; highlight: [43];">
package org.pdfclown.samples;

import java.util.HashSet;
import java.util.Set;

import org.pdfclown.documents.Document;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.Pages;
import org.pdfclown.files.File;
import org.pdfclown.objects.PdfReference;
import org.pdfclown.tools.PageManager;

public class SplitSample
  extends Sample
{
  private static final long MaxDataSize = 4 &lt;&lt; 20; // 4 MBytes (you can obviously set it at your will).
  private PageManager manager;

  @Override
  public boolean run(
    )
  {
    // 1. Opening the PDF file...
    File file;
    {
      String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);
      try
      {file = new File(filePath);}
      catch(Exception e)
      {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);}
    }
    Document document = file.getDocument();
    Pages pages = document.getPages();

    // 2. Splitting the document...
    manager = new PageManager(document);
    int splitIndex = 0;
    long incrementalDataSize = 0;
    int beginPageIndex = 0;
    Set&lt;PdfReference&gt; visitedReferences = new HashSet&lt;PdfReference&gt;();
    for(Page page : pages)
    {
      long pageDifferentialDataSize = PageManager.getSize(page,visitedReferences); 
      incrementalDataSize += pageDifferentialDataSize;
      if(incrementalDataSize &gt; MaxDataSize) // Data size limit reached.
      {
        int endPageIndex = page.getIndex();

        // Split the current document page range!
        splitDocument(++splitIndex,beginPageIndex,endPageIndex);

        beginPageIndex = endPageIndex;
        incrementalDataSize = PageManager.getSize(page,visitedReferences = new HashSet&lt;PdfReference&gt;());
      }
    }
    // Split the last document page range!
    splitDocument(++splitIndex,beginPageIndex,pages.size());

    return true;
  }

  private void splitDocument(
    int splitIndex,
    int beginPageIndex,
    int endPageIndex
    )
  {
    // 1. Split the document!
    Document splitDocument = manager.extract(beginPageIndex,endPageIndex);

    // 2. Serialize the split file!
    serialize(splitDocument.getFile(),this.getClass().getSimpleName() + &quot;.&quot; + (splitIndex),false);
  }
}
</pre></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/117/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/117/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/117/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=117&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2010/09/23/waiting-for-pdf-clown-0-1-release/feed/</wfw:commentRss>
		<slash:comments>12</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2010/09/render1.jpg" medium="image">
			<media:title type="html">Rasterization comparison</media:title>
		</media:content>
	</item>
		<item>
		<title>PDF Clown 0.0.8: Q&amp;A</title>
		<link>http://pdfclown.wordpress.com/2010/09/04/pdf-clown-0-0-8-qa/</link>
		<comments>http://pdfclown.wordpress.com/2010/09/04/pdf-clown-0-0-8-qa/#comments</comments>
		<pubDate>Sat, 04 Sep 2010 10:23:37 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Help]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=103</guid>
		<description><![CDATA[[NOTE: this post was updated on March 21, 2011] Latest news: PDF Clown 0.0.8 functionalities are part of the latest release (PDF Clown 0.1.0) &#8212; as 0.0 version series is under decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version series. Thank you! This post collects all the relevant information about issues and questions [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=103&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:right;"><span style="color:#999999;">[NOTE: this post was updated on March 21, 2011]</span></p>
<p style="text-align:center;"><span style="color:#999999;"><span style="color:#99cc00;">Latest news: PDF Clown 0.0.8 functionalities are part of the latest release (<a href="http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/"><strong>PDF Clown 0.1.0</strong></a>) &#8212; as 0.0 version series is under decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version series. Thank you!</span><br />
</span></p>
<p>This post collects all the relevant information about <strong>issues and questions regarding PDF Clown 0.0.8</strong>.</p>
<p><em>If you have any doubt on topics not treated here, please post your question to the <a href="http://sourceforge.net/projects/clown/forums/forum/607163">Help forum</a>.</em></p>
<h3>1. &#8216;GoToExternalDestination&#8217; class missing</h3>
<p>See <a href="http://sourceforge.net/projects/clown/forums/forum/607163/topic/3836075">Topic 3836075</a> in the Help forum.</p>
<h3>2. &#8216;xref&#8217; keyword not found</h3>
<p>See <a href="http://sourceforge.net/projects/clown/forums/forum/607163/topic/3434621">Topic 3434621</a> in the Help forum.</p>
<h3>3. Unknown type: Comment</h3>
<p>See <a href="http://sourceforge.net/projects/clown/forums/forum/607163/topic/3863926">Topic  3863926</a> in the Help forum.</p>
<h3>4. Text line height</h3>
<p>See <a href="https://sourceforge.net/projects/clown/forums/forum/607163/topic/3928380">Topic 3928380</a> in the Help forum.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/103/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/103/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/103/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=103&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2010/09/04/pdf-clown-0-0-8-qa/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>
	</item>
		<item>
		<title>PDF Clown 0.0.8 has been released!</title>
		<link>http://pdfclown.wordpress.com/2010/08/22/pdf-clown-0-0-8-has-been-released/</link>
		<comments>http://pdfclown.wordpress.com/2010/08/22/pdf-clown-0-0-8-has-been-released/#comments</comments>
		<pubDate>Sun, 22 Aug 2010 02:45:43 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Release]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=92</guid>
		<description><![CDATA[[NOTE: this post was updated on March 21, 2011] Latest news: PDF Clown 0.0.8 functionalities are part of the latest release (PDF Clown 0.1.0) &#8212; as 0.0 version series is under decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version series. Thank you! This release is focused on text extraction support: a specialized tool [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=92&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:right;"><span style="color:#808080;">[NOTE: this post was updated on  March 21, 2011]</span></p>
<p style="text-align:center;"><span style="color:#99cc00;">Latest  news: PDF Clown 0.0.8 functionalities are part of the latest release (</span><a href="http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/"><strong>PDF  Clown 0.1.0</strong></a><span style="color:#99cc00;">) &#8212; as 0.0 version series is under  decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version  series. Thank you!</span></p>
<p>This release is focused on <a href="http://pdfclown.wordpress.com/2010/01/02/upcoming-0-0-8-whats-going-to-be-new/"><strong>text extraction</strong> support</a>: <em>a specialized  tool provides, along with plain-text extraction, advanced  functionalities</em> such as full graphic state of extracted text (font, font  size, text color, text rendering mode, text position&#8230;), text  filtering by area, text grouping and sorting. Lots of minor improvements  have been applied too.</p>
<p>Java version migrated to <strong>Java 6</strong> platform, while C#/.NET version migrated  to <strong>.NET 3.5</strong>.</p>
<p><a href="http://www.gnu.org/licenses/lgpl.html"><strong> LGPL 3</strong></a> is the new license applied to the project.</p>
<p>Last but not least: the distribution&#8217;s directory structure has been  revised to simplify its navigation and ease its integration with common  IDEs (Eclipse- and Visual Studio-compatible).</p>
<p>This release may be downloaded from:<br />
<a href="https://sourceforge.net/projects/clown/files/PDFClown-devel/0.0.8%20Alpha/" target="_new">https://sourceforge.net/projects/clown/files/PDFClown-devel/0.0.8%20Alpha/</a></p>
<p>enjoy!</p>
<h3>Patches</h3>
<p>See <a href="http://sourceforge.net/tracker/?limit=25&amp;func=&amp;group_id=176158&amp;atid=876125&amp;assignee=&amp;status=&amp;category=&amp;artgroup=&amp;keyword=0.0.8.1&amp;submitter=&amp;artifact_id=&amp;assignee=&amp;status=&amp;category=&amp;artgroup=&amp;submitter=&amp;keyword=PDF+Clown+0.0.8&amp;artifact_id=&amp;submit=Filter">PDF Clown 0.0.8 patches</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/92/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/92/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/92/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=92&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2010/08/22/pdf-clown-0-0-8-has-been-released/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>
	</item>
		<item>
		<title>Waiting for PDF Clown 0.0.8 release</title>
		<link>http://pdfclown.wordpress.com/2010/01/02/upcoming-0-0-8-whats-going-to-be-new/</link>
		<comments>http://pdfclown.wordpress.com/2010/01/02/upcoming-0-0-8-whats-going-to-be-new/#comments</comments>
		<pubDate>Sat, 02 Jan 2010 16:00:04 +0000</pubDate>
		<dc:creator>stechio</dc:creator>
				<category><![CDATA[Development]]></category>
		<category><![CDATA[PDF]]></category>
		<category><![CDATA[software]]></category>
		<category><![CDATA[text extraction]]></category>

		<guid isPermaLink="false">http://pdfclown.wordpress.com/?p=30</guid>
		<description><![CDATA[[NOTE: this post was updated on March 21, 2011] Latest news: PDF Clown 0.0.8 functionalities are part of the latest release (PDF Clown 0.1.0) &#8212; as 0.0 version series is under decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version series. Thank you! I know, it&#8217;s been just about one year since the latest [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=30&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p style="text-align:right;"><span style="color:#999999;">[NOTE: this post was updated on March 21, 2011]</span></p>
<p style="text-align:center;"><span style="color:#999999;"><span style="color:#99cc00;">Latest news: PDF Clown 0.0.8 functionalities are part of the latest release (<a href="http://pdfclown.wordpress.com/2011/03/04/pdf-clown-0-1-0-has-been-released/"><strong>PDF Clown 0.1.0</strong></a>) &#8212; as 0.0 version series is under decommissioning, you&#8217;re warmly invited to adopt the current 0.1 version series. Thank you!</span><br />
</span></p>
<p>I know, it&#8217;s been just about one year since the latest version (<a href="http://sourceforge.net/project/shownotes.php?release_id=651012" target="_blank">0.0.7</a>) was released&#8230; please, forgive me! <img src='http://s1.wp.com/wp-includes/images/smilies/icon_wink.gif' alt=';-)' class='wp-smiley' /> </p>
<p>In the meantime PDF Clown has been growing considerably to provide a rich <strong>text extraction</strong> functionality for its next 0.0.8 version:</p>
<ul>
<li>the <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/fonts/package-summary.html" target="_blank"><strong>font model</strong></a> has been deeply revised and expanded to smoothly support character encoding issues;</li>
<li>the <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/package-summary.html" target="_blank"><strong>content stream model</strong></a> has been further harmonized to simplify the access to text contents;</li>
<li>the <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/ContentScanner.html" target="_blank"><strong>content scanner</strong></a> has been simplified in its iterative mechanism and enriched through a new level of abstraction to allow easy object placement detection (image and text characters coordinates);</li>
<li>a <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/tools/TextExtractor.html"><strong>text extraction tool</strong></a> allows sub-page region selection to extract text only from specific page areas.</li>
</ul>
<p>Waiting for the termination of the current development iteration, let&#8217;s see some new stuff!</p>
<p>NOTE: the following code samples are expressed extending the Sample class common to all the <a href="http://en.wikipedia.org/wiki/Command-line_interface">CLI</a> samples shipped with <a href="http://sourceforge.net/projects/clown/files/PDFClown-devel/0.0.8%20Alpha/">PDF Clown 0.0.8 downloadable distribution</a>.</p>
<h3>1. Basic text extraction</h3>
<p>This code sample demonstrates the most basic way to extract text content according to PDF Clown 0.0.8.</p>
<p><pre class="brush: java; highlight: [36,41,42,43,63,66,67,68,69,70,71];">
package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.contents.ContentScanner;
import it.stefanochizzolini.clown.documents.contents.fonts.Font;
import it.stefanochizzolini.clown.documents.contents.objects.ContainerObject;
import it.stefanochizzolini.clown.documents.contents.objects.ContentObject;
import it.stefanochizzolini.clown.documents.contents.objects.ShowText;
import it.stefanochizzolini.clown.documents.contents.objects.Text;
import it.stefanochizzolini.clown.files.File;

import java.util.HashMap;
import java.util.Map;

public class BasicTextExtractionSample
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);}

    // 2. Get the PDF document!
    Document document = file.getDocument();

    // 3. Extracting text from the document pages...
    for(Page page : document.getPages())
    {
      if(!prompt(page))
        return false;

      extract(
        new ContentScanner(page) // Wraps the page contents into a scanner.
        );
    }

    return true;
  }

  /**
    Scans a content level looking for text.
  */
  /*
    NOTE: Page contents are represented by a sequence of content objects,
    possibly nested into multiple levels.
  */
  private void extract(
    ContentScanner level
    )
  {
    if(level == null)
      return;

    while(level.moveNext())
    {
      ContentObject content = level.getCurrent();
      if(content instanceof ShowText)
      {
        Font font = level.getState().font;
        // Extract the current text chunk, decoding it!
        System.out.println(font.decode(((ShowText)content).getText()));
      }
      else if(content instanceof Text
        || content instanceof ContainerObject)
      {
        // Scan the inner level!
        extract(level.getChildLevel());
      }
    }
  }

  private boolean prompt(
    Page page
    )
  {
    int pageIndex = page.getIndex();
    if(pageIndex &gt; 0)
    {
      Map&lt;String,String&gt; options = new HashMap&lt;String,String&gt;();
      options.put(&quot;&quot;, &quot;Scan next page&quot;);
      options.put(&quot;Q&quot;, &quot;End scanning&quot;);
      if(!promptChoice(options).equals(&quot;&quot;))
        return false;
    }

    System.out.println(&quot;\nScanning page &quot; + (pageIndex+1) + &quot;...\n&quot;);
    return true;
  }
}
</pre></p>
<p>In order to understand this sample, you have to know that the PDF Specification prescribes text content to be shown through so-called ShowText operations; so, we look for that kind of object&#8230;</p>
<p>Here is how it works:</p>
<ol>
<li>iterate the <strong>document pages</strong> [lines 36-44] applying the <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/ContentScanner.html" target="_blank">ContentScanner</a> to the current page [lines 41-43];</li>
<li>iterate the current <strong>page contents</strong> (through ContentScanner) looking for ShowText operations [lines 63-78], recurring into <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/ContainerObject.html" target="_blank">ContainerObject</a>-s and <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/Text.html" target="_blank">Text</a> objects;</li>
<li>extract the text content from <strong><a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/objects/ShowText.html" target="_blank">ShowText</a> operations</strong> [line 70].</li>
</ol>
<div id="attachment_71" class="wp-caption aligncenter" style="width: 610px"><img class="size-full wp-image-71" title="Source document" src="http://pdfclown.files.wordpress.com/2010/01/textextraction1.jpg?w=700" alt=""   /><p class="wp-caption-text">Incipit of the Japanese translation of the UN Universal Declaration of Human Rights</p></div>
<p>Applying this code sample to a document such as the <a href="http://www.ohchr.org/EN/UDHR/Pages/Language.aspx?LangID=jpn">Japanese translation</a> of the <a href="http://www.ohchr.org/EN/UDHR/Pages/Introduction.aspx">UN Universal Declaration of Human Rights</a> (see above), the result is pretty accurate, although the extracted text contains excess line breaks (see second row in the figure below): this discrepancy is due to the way the PDF Specification defines text data representation. Particularly, contents within ShowText operations may have been (legally) arbitrarily split by the document generator, as at the time of its inception the PDF format was primarily aimed at typographic rendition instead of content accessibility. For this purpose, the above-mentioned TextExtractor tool provides the appropriate heuristics to effortlessly organize the extracted text in a more intelligible manner (see the following paragraphs).</p>
<div id="attachment_72" class="wp-caption aligncenter" style="width: 487px"><img class="size-full wp-image-72" title="Extracted text" src="http://pdfclown.files.wordpress.com/2010/01/textextraction2.jpg?w=700" alt=""   /><p class="wp-caption-text">Text extracted by PDF Clown from the incipit of the Japanese translation of the UN Universal Declaration of Human Rights.</p></div>
<h3>2. Extended text extraction</h3>
<p>This code sample shows <em>how to exploit the new abstraction level provided by the content scanner of PDF Clown 0.0.8</em>, which allows you to get a rich set of information describing the <strong>graphic state of extracted text</strong> (font, font size, text color, text rendering mode, text bounding box&#8230;).</p>
<p>In order to demonstrate its precision in detecting text position, the following code also draws the bounding box of each single character appearing on the pages.</p>
<p><pre class="brush: java; highlight: [82,84,99];">
package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.contents.ContentScanner;
import it.stefanochizzolini.clown.documents.contents.TextChar;
import it.stefanochizzolini.clown.documents.contents.colorSpaces.DeviceRGBColor;
import it.stefanochizzolini.clown.documents.contents.composition.PrimitiveFilter;
import it.stefanochizzolini.clown.documents.contents.objects.ContainerObject;
import it.stefanochizzolini.clown.documents.contents.objects.ContentObject;
import it.stefanochizzolini.clown.documents.contents.objects.Text;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.tools.PageStamper;

import java.awt.geom.Rectangle2D;

public class TextInfoExtractionSample // Sample: prints each text string of a PDF file and outlines character and string boxes on its pages.
  extends Sample
{
  private DeviceRGBColor[] textCharBoxColors = new DeviceRGBColor[] // Stroke colors for character boxes, cycled one per text string.
    {
      new DeviceRGBColor(200f/255,100f/255,100f/255),
      new DeviceRGBColor(100f/255,200f/255,100f/255),
      new DeviceRGBColor(100f/255,100f/255,200f/255)
    };
  private DeviceRGBColor textStringBoxColor = DeviceRGBColor.Black; // Stroke color for the dashed text string boxes.

  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);} // Rethrown unchecked, keeping the original cause.

    // 2. Get the PDF document!
    Document document = file.getDocument();

    PageStamper stamper = new PageStamper(); // NOTE: Page stamper is used to draw contents on existing pages.

    // 3. Iterating through the document pages...
    for(Page page : document.getPages())
    {
      System.out.println(&quot;\nScanning page &quot; + (page.getIndex()+1) + &quot;...\n&quot;); // Page number is printed one-based.

      stamper.setPage(page); // The same stamper is retargeted to each page in turn.

      extract(
        new ContentScanner(page), // Wraps the page contents into a scanner.
        stamper.getForeground()
        );

      stamper.flush(); // Presumably commits the drawn boxes to the current page; confirm against PageStamper.
    }

    serialize(file,false); // Helper from Sample (not shown here); presumably saves the stamped file.

    return true;
  }

  /**
    Scans a content level looking for text: prints each text string with its box, outlines characters and strings, and recurses into containers.
  */
  private void extract(
    ContentScanner level,
    PrimitiveFilter builder
    )
  {
    if(level == null) // The recursive call below passes getChildLevel() unchecked, so guard here.
      return;

    while(level.moveNext())
    {
      ContentObject content = level.getCurrent();
      if(content instanceof Text)
      {
        ContentScanner.TextWrapper text = (ContentScanner.TextWrapper)level.getCurrentWrapper();
        int colorIndex = 0; // Advanced before use and reset for each text object, so the first string uses textCharBoxColors[1].
        for(ContentScanner.TextStringWrapper textString : text.getTextStrings())
        {
          Rectangle2D stringBox = textString.getBox();
          System.out.println(
            &quot;Text [&quot;
              + &quot;x:&quot; + Math.round(stringBox.getX()) + &quot;,&quot;
              + &quot;y:&quot; + Math.round(stringBox.getY()) + &quot;,&quot;
              + &quot;w:&quot; + Math.round(stringBox.getWidth()) + &quot;,&quot;
              + &quot;h:&quot; + Math.round(stringBox.getHeight())
              + &quot;]: &quot; + textString.getText()
              );

          // Drawing text character bounding boxes (one color per text string)...
          colorIndex = (colorIndex + 1) % textCharBoxColors.length;
          builder.setStrokeColor(textCharBoxColors[colorIndex]);
          for(TextChar textChar : textString.getTextChars())
          {
            /*
              NOTE: You can get further text information
              (font, font size, text color, text rendering mode)
              through textChar.style.
             */
            builder.drawRectangle(textChar.box);
            builder.stroke();
          }

          // Drawing text string bounding box (dashed, in textStringBoxColor)...
          builder.beginLocalState(); // Presumably confines the dash and color settings below until builder.end(); confirm.
          builder.setLineDash(0, 5);
          builder.setStrokeColor(textStringBoxColor);
          builder.drawRectangle(textString.getBox());
          builder.stroke();
          builder.end();
        }
      }
      else if(content instanceof ContainerObject)
      {
        // Scan the inner level, drawing through the same builder!
        extract(level.getChildLevel(),builder);
      }
    }
  }
}
</pre></p>
<p>This sample works exactly the same way as the previous &#8220;1. Basic text extraction&#8221; sample, but it dramatically enhances the extraction functionality by providing decoded text along with its graphic attributes, such as font, font size, bounding box, text color, and so on:</p>
<ol>
<li>ContentScanner.TextWrapper represents a <strong>text object</strong> extracted from the ContentScanner [line 82];</li>
<li>each ContentScanner.TextWrapper contains a list of <strong>text chunks </strong>(ContentScanner.TextStringWrapper) [line 84];</li>
<li>each ContentScanner.TextStringWrapper contains a list of <strong>text characters</strong> (TextChar) [line 99];</li>
<li>each TextChar provides information about the <strong>character state </strong>(position and style).</li>
</ol>
<p>The figure below shows the result of this code running over the <a href="http://www.ohchr.org/EN/UDHR/Pages/Language.aspx?LangID=grk">Greek translation</a> of the <a href="http://www.ohchr.org/EN/UDHR/Pages/Introduction.aspx">UN Universal Declaration of Human Rights</a>.</p>
<div id="attachment_77" class="wp-caption aligncenter" style="width: 710px"><a href="http://pdfclown.files.wordpress.com/2010/01/textextraction3.jpg"><img class="size-full wp-image-77" title="Text bounding box detection" src="http://pdfclown.files.wordpress.com/2010/01/textextraction3.jpg?w=700&#038;h=390" alt="" width="700" height="390" /></a><p class="wp-caption-text">Text characters framed within their respective bounding boxes.</p></div>
<h3>3. Advanced text extraction</h3>
<p>PDF Clown supports a third level of text extraction functionality built upon the others (basic and extended, as seen above): the <strong>TextExtractor</strong> tool.</p>
<p>Its purpose is to <em>leverage the extended text extraction features for sorting, aggregating and integrating the retrieved text chunks</em>. With TextExtractor you can:</p>
<ul>
<li>extract <strong>full text information</strong> (text content along with graphic attributes for each single character (font, font size, text color, text rendering mode, text bounding box…)) or just <strong>plain text</strong>;</li>
<li>extract <strong>all the text content</strong> in a page (or any other <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/IContentContext.html">IContentContext</a>, such as <a href="http://clown.sourceforge.net/API/it/stefanochizzolini/clown/documents/contents/xObjects/FormXObject.html">FormXObject</a>) or filter just <strong>partial page areas</strong>.</li>
</ul>
<h4>3.1. Plain text extraction</h4>
<p>This sample demonstrates the <em>extreme simplicity involved in extracting plain text from a page</em>: after you have instantiated the TextExtractor [line 31], it&#8217;s just a matter of passing your page [line 38] &#8212; nothing but 1 line of code!</p>
<p><pre class="brush: java; highlight: [31,38];">
package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.tools.TextExtractor;

import java.util.HashMap;
import java.util.Map;

public class AdvancedPlainTextExtractionSample // Sample: prints the plain text of each page of a user-chosen PDF file.
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);} // Rethrown unchecked, keeping the original cause.

    // 2. Get the PDF document!
    Document document = file.getDocument();

    // 3. Extracting plain text from the document pages...
    TextExtractor extractor = new TextExtractor(); // A single extractor is reused for all the pages.
    for(Page page : document.getPages())
    {
      if(!prompt(page)) // The user chose not to go on.
        return false;

      // Extract plain text from the current page!
      System.out.println(extractor.extractPlain(page));
    }

    return true;
  }

  /**
    Announces the page about to be scanned; for every page after the first one,
    the user is asked beforehand whether to go on.

    @param page Page about to be scanned.
    @return false if the user picked anything but the empty (&quot;Scan next page&quot;) option,
      true otherwise.
  */
  private boolean prompt(
    Page page
    )
  {
    int pageIndex = page.getIndex();
    if(pageIndex &gt; 0) // The first page (index 0) is scanned without asking.
    {
      Map&lt;String,String&gt; options = new HashMap&lt;String,String&gt;();
      options.put(&quot;&quot;, &quot;Scan next page&quot;);
      options.put(&quot;Q&quot;, &quot;End scanning&quot;);
      if(!promptChoice(options).equals(&quot;&quot;)) // Any non-empty choice ends the scan, not just Q.
        return false;
    }

    System.out.println(&quot;\nScanning page &quot; + (pageIndex+1) + &quot;...\n&quot;); // Page number is printed one-based.
    return true;
  }
}
</pre></p>
<h4>3.2. Full text extraction</h4>
<p>In this case <em>text content is extracted along with its graphic attributes</em> (font, font size, text color, text rendering mode, text bounding box&#8230;).<br />
Note that, as we didn&#8217;t specify any particular page area, text strings are all gathered within the default area (the page itself), identified by the <code>null</code> key [line 40].</p>
<p><pre class="brush: java; highlight: [40];">
package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.contents.ITextString;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.tools.TextExtractor;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AdvancedTextExtractionSample // Sample: prints each text string of a user-chosen PDF file along with its bounding box.
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);} // Rethrown unchecked, keeping the original cause.

    // 2. Get the PDF document!
    Document document = file.getDocument();

    // 3. Extracting text from the document pages...
    TextExtractor extractor = new TextExtractor(); // A single extractor is reused for all the pages.
    for(Page page : document.getPages())
    {
      if(!prompt(page)) // The user chose not to go on.
        return false;

      List&lt;ITextString&gt; textStrings = extractor.extract(page).get(null); // No areas were set, so the null key holds the whole-page strings.
      for(ITextString textString : textStrings)
      {
        Rectangle2D textStringBox = textString.getBox();
        System.out.println(
          &quot;Text [&quot;
            + &quot;x:&quot; + Math.round(textStringBox.getX()) + &quot;,&quot;
            + &quot;y:&quot; + Math.round(textStringBox.getY()) + &quot;,&quot;
            + &quot;w:&quot; + Math.round(textStringBox.getWidth()) + &quot;,&quot;
            + &quot;h:&quot; + Math.round(textStringBox.getHeight())
            + &quot;]: &quot; + textString.getText()
            );
      }
    }

    return true;
  }

  /**
    Announces the page about to be scanned; for every page after the first one,
    the user is asked beforehand whether to go on.

    @param page Page about to be scanned.
    @return false if the user picked anything but the empty (&quot;Scan next page&quot;) option,
      true otherwise.
  */
  private boolean prompt(
    Page page
    )
  {
    int pageIndex = page.getIndex();
    if(pageIndex &gt; 0) // The first page (index 0) is scanned without asking.
    {
      Map&lt;String,String&gt; options = new HashMap&lt;String,String&gt;();
      options.put(&quot;&quot;, &quot;Scan next page&quot;);
      options.put(&quot;Q&quot;, &quot;End scanning&quot;);
      if(!promptChoice(options).equals(&quot;&quot;)) // Any non-empty choice ends the scan, not just Q.
        return false;
    }

    System.out.println(&quot;\nScanning page &quot; + (pageIndex+1) + &quot;...\n&quot;); // Page number is printed one-based.
    return true;
  }
}
</pre></p>
<h4>3.3. Page area filtering</h4>
<p><em><strong>Text filtering by page area</strong> can be done both before and after extracting the page text</em>:</p>
<ul>
<li>pre-filtering: TextExtractor.getAreas()/setAreas(&#8230;) methods allow the user to define the relevant page areas before extracting the text;</li>
<li>post-filtering: TextExtractor.filter(&#8230;) methods allow the user to select text by area from previously-extracted text (useful in case of multi-stage processing).</li>
</ul>
<p>In this case we apply text filtering to a common task: retrieving the text associated with link annotations on a page [lines 80-82] (you may not know that text links on PDF pages are just superimposed on the &#8220;associated&#8221; text, so a location inference is needed in order to match the position of a link annotation with the respective text &#8212; such tough work <img src='http://s0.wp.com/wp-includes/images/smilies/icon_biggrin.gif' alt=':-D' class='wp-smiley' /> ).</p>
<p><pre class="brush: java; highlight: [80,81,82];">
package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.PageAnnotations;
import it.stefanochizzolini.clown.documents.contents.ITextString;
import it.stefanochizzolini.clown.documents.fileSpecs.FileSpec;
import it.stefanochizzolini.clown.documents.interaction.actions.Action;
import it.stefanochizzolini.clown.documents.interaction.actions.GoToDestination;
import it.stefanochizzolini.clown.documents.interaction.actions.GoToEmbedded;
import it.stefanochizzolini.clown.documents.interaction.actions.GoToNonLocal;
import it.stefanochizzolini.clown.documents.interaction.actions.GoToURI;
import it.stefanochizzolini.clown.documents.interaction.actions.GoToEmbedded.TargetObject;
import it.stefanochizzolini.clown.documents.interaction.annotations.Annotation;
import it.stefanochizzolini.clown.documents.interaction.annotations.Link;
import it.stefanochizzolini.clown.documents.interaction.navigation.document.Destination;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.objects.PdfObjectWrapper;
import it.stefanochizzolini.clown.tools.TextExtractor;

import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class LinkTextExtractionSample // Sample: prints text, position and target of each link annotation of a user-chosen PDF file.
  extends Sample
{
  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice(&quot;Please select a PDF file&quot;);

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + &quot; file access error.&quot;,e);} // Rethrown unchecked, keeping the original cause.

    // 2. Get the PDF document!
    Document document = file.getDocument();

    // 3. Extracting link text from the document pages...
    TextExtractor extractor = new TextExtractor();
    extractor.setAreaTolerance(2); // 2 pt tolerance on area boundary detection.
    for(Page page : document.getPages())
    {
      if(!prompt(page)) // The user chose not to go on.
        return false;

      Map&lt;Rectangle2D,List&lt;ITextString&gt;&gt; textStrings = null; // Extracted lazily, at most once per page, on the first link found.

      // Get the page annotations!
      PageAnnotations annotations = page.getAnnotations();
      if(annotations == null)
      {
        System.out.println(&quot;No annotations here.&quot;);
        continue;
      }

      boolean linkFound = false;
      for(Annotation annotation : annotations)
      {
        if(annotation instanceof Link) // Annotations other than links are ignored.
        {
          linkFound = true;

          if(textStrings == null)
          {textStrings = extractor.extract(page);}

          Link link = (Link)annotation;
          Rectangle2D linkBox = link.getBox();
          /*
            Extracting text superimposed by the link...
            NOTE: As links have no strong relation to page text but a weak location correspondence,
            we have to filter extracted text by link area.
          */
          StringBuilder linkTextBuilder = new StringBuilder();
          for(ITextString linkTextString : extractor.filter(textStrings,linkBox))
          {linkTextBuilder.append(linkTextString.getText());}
          System.out.println(&quot;Link '&quot; + linkTextBuilder + &quot;' &quot;);
          System.out.println(
            &quot;    Position: &quot;
              + &quot;x:&quot; + Math.round(linkBox.getX()) + &quot;,&quot;
              + &quot;y:&quot; + Math.round(linkBox.getY()) + &quot;,&quot;
              + &quot;w:&quot; + Math.round(linkBox.getWidth()) + &quot;,&quot;
              + &quot;h:&quot; + Math.round(linkBox.getHeight())
              );
          System.out.print(&quot;    Target: &quot;);
          // The link target is reported according to its kind (destination, action, missing, or unknown).
          PdfObjectWrapper&lt;?&gt; target = link.getTarget();
          if(target instanceof Destination)
          {printDestination((Destination)target);}
          else if(target instanceof Action)
          {printAction((Action)target);}
          else if(target == null)
          {System.out.println(&quot;[not available]&quot;);}
          else
          {System.out.println(&quot;[unknown type: &quot; + target.getClass().getSimpleName() + &quot;]&quot;);}
        }
      }
      if(!linkFound)
      {
        System.out.println(&quot;No links here.&quot;);
        continue; // NOTE: redundant, as this is the last statement of the loop body.
      }
    }

    return true;
  }

  /**
    Prints an action: first its class simple name and base object, then the details
    specific to its kind.
    Go-to-destination actions print their destination, preceded (for non-local ones)
    by the file specification filename, if any, and (for embedded ones) by the embedded
    file name and relation of the target. Go-to-URI actions print their URI.
    Any other action kind prints the first line only.

    @param action Action to print.
  */
  private void printAction(
    Action action
    )
  {
    System.out.println(&quot;Action [&quot; + action.getClass().getSimpleName() + &quot;] &quot; + action.getBaseObject());
    if(action instanceof GoToDestination&lt;?&gt;)
    {
      if(action instanceof GoToNonLocal&lt;?&gt;)
      {
        FileSpec fileSpec = ((GoToNonLocal&lt;?&gt;)action).getFileSpec();
        if(fileSpec != null)
        {System.out.println(&quot;    Filename: &quot; + fileSpec.getFilename());}

        if(action instanceof GoToEmbedded)
        {
          TargetObject target = ((GoToEmbedded)action).getTarget(); // NOTE(review): used without a null check; confirm getTarget() never returns null.
          System.out.println(&quot;    EmbeddedFilename: &quot; + target.getEmbeddedFileName() + &quot; Relation: &quot; + target.getRelation());
        }
      }
      System.out.print(&quot;    &quot;); // Indents the destination line printed next.
      printDestination(((GoToDestination&lt;?&gt;)action).getDestination());
    }
    else if(action instanceof GoToURI)
    {System.out.println(&quot;    URI: &quot; + ((GoToURI)action).getURI());}
  }

  /**
    Prints a destination: its class simple name and base object, followed by
    the page it refers to.

    @param destination Destination to print.
  */
  private void printDestination(
    Destination destination
    )
  {
    System.out.println(destination.getClass().getSimpleName() + &quot; &quot; + destination.getBaseObject());
    System.out.print(&quot;    Page &quot;);
    Object pageRef = destination.getPageRef();
    if(pageRef instanceof Page)
    {
      Page refPage = (Page)pageRef;
      System.out.println((refPage.getIndex()+1) + &quot; [ID: &quot; + refPage.getBaseObject() + &quot;]&quot;); // Page number is printed one-based.
    }
    else
    {System.out.println(((Integer)pageRef+1));} // Any non-Page reference is cast to Integer (presumably a zero-based page index; confirm).
  }

  /**
    Announces the page about to be scanned; for every page after the first one,
    the user is asked beforehand whether to go on.

    @param page Page about to be scanned.
    @return false if the user picked anything but the empty (&quot;Scan next page&quot;) option,
      true otherwise.
  */
  private boolean prompt(
    Page page
    )
  {
    int pageIndex = page.getIndex();
    if(pageIndex &gt; 0) // The first page (index 0) is scanned without asking.
    {
      Map&lt;String,String&gt; options = new HashMap&lt;String,String&gt;();
      options.put(&quot;&quot;, &quot;Scan next page&quot;);
      options.put(&quot;Q&quot;, &quot;End scanning&quot;);
      if(!promptChoice(options).equals(&quot;&quot;)) // Any non-empty choice ends the scan, not just Q.
        return false;
    }

    System.out.println(&quot;\nScanning page &quot; + (pageIndex+1) + &quot;...\n&quot;); // Page number is printed one-based.
    return true;
  }
}
</pre></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/pdfclown.wordpress.com/30/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/pdfclown.wordpress.com/30/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/pdfclown.wordpress.com/30/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=pdfclown.wordpress.com&amp;blog=9357253&amp;post=30&amp;subd=pdfclown&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://pdfclown.wordpress.com/2010/01/02/upcoming-0-0-8-whats-going-to-be-new/feed/</wfw:commentRss>
		<slash:comments>20</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/9b1849639b3f85a32527cf84fcfa52ff?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">stechio</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2010/01/textextraction1.jpg" medium="image">
			<media:title type="html">Source document</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2010/01/textextraction2.jpg" medium="image">
			<media:title type="html">Extracted text</media:title>
		</media:content>

		<media:content url="http://pdfclown.files.wordpress.com/2010/01/textextraction3.jpg" medium="image">
			<media:title type="html">Text bounding box detection</media:title>
		</media:content>
	</item>
	</channel>
</rss>
