text extraction from image source code
001/**
002 * Copyright © 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 * * Redistributions of source code must retain the above copyright notice,
009 * this list of conditions and the following disclaimer.
010 *
011 * * Redistributions in binary form must reproduce the above copyright notice,
012 * this list of conditions and the following disclaimer in the documentation
013 * and/or other materials provided with the distribution.
014 *
015 * * Neither the name of the University of Southampton nor the names of its
016 * contributors may be used to endorse or promote products derived from this
017 * software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030/**
031 *
032 */
033package org.openimaj.image.text.extraction;
034
035import java.util.ArrayList;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Map;
039
040import org.openimaj.image.Image;
041import org.openimaj.image.processor.ImageProcessor;
042import org.openimaj.image.text.ocr.OCRProcessor;
043import org.openimaj.math.geometry.shape.Rectangle;
044import org.openimaj.util.pair.IndependentPair;
045
046/**
047 * An interface for classes that are able to extract text from images.
048 * The single method allows the retrieval of the text mapped to the
049 * bounding boxes of the text within the image.
050 * <p>
051 * Note that this is an {@link ImageProcessor} extension so that the
052 * {@link TextExtractor} should process the image prior to the
053 * {@link #getTextRegions()} method being called.
054 * <p>
055 * This class will deal with the processing of extracted text regions
056 * with the OCR processor. Use {@link #setOCRProcessor(OCRProcessor)} to
057 * choose with OCR processor will be used on the extracted regions.
058 *
059 * @author David Dupplaw (dpd[at]ecs.soton.ac.uk)
060 * @created 11 Aug 2011
061 *
062 * @param <T> The type of {@link Image}
063 */
064public abstract class TextExtractor<T extends Image<?,T>>
065 implements ImageProcessor<T>
066{
067 /** The OCR Processor to extract strings from text regions. */
068 private OCRProcessor<T> ocr = null;
069
070 /**
071 * Get the text regions that can be extracted from an image. The images
072 * in the values of the map need not be simply the extracted region that
073 * is bounded by the rectangular key (this can be done afterwards), but
074 * may be a representation that is as near to canonical as possible -
075 * that is, it may be warped or thresholded such that an OCR processor
076 * may have less trouble reading the text
077 *
078 * @return A map from bounding box in original image to a canonical
079 * representation of the text (may be warped or thresholded)
080 */
081 public abstract Map<Rectangle,T> getTextRegions();
082
083 /**
084 * Get text that can be extracted from an image. The map should map a
085 * bounding box within the processed image to a pair of extracted image vs.
086 * text string. The extracted image may not necessarily be the region
087 * of interest which the rectangle bounds; it can be as close to a
088 * canonical representation of the text as possible such that an OCR
089 * would have less difficulty in classifying the text. For example,
090 * the image may be thresholded or warped such that the text is straight.
091 *
092 * @return A map of bounding box to a pair of image and text string
093 */
094 public Map<Rectangle, IndependentPair<T, String>> getText()
095 {
096 // The result map for the method
097 Map<Rectangle, IndependentPair<T, String>> textMap =
098 new HashMap<Rectangle, IndependentPair<T,String>>();
099
100 // Get the regions
101 Map<Rectangle,T> textRegions = getTextRegions();
102
103 // OCR the text from the text regions
104 if( ocr != null )
105 {
106 for( Rectangle r : textRegions.keySet() )
107 {
108 // Process the image with the OCR Processor
109 textRegions.get®.analyseWith( ocr );
110
111 // Get the text from the OCR Processor
112 Map<Rectangle, String> m = ocr.getText();
113
114 // For each of the rectangles returned from the OCR
115 // we add them individually into the output set.
116 for( Rectangle subR: m.keySet() )
117 {
118 String s = m.get( subR );
119
120 // Translate into image coordinates (from sub-image coords)
121 subR.translate( r.x, r.y );
122
123 // Put into the output map
124 textMap.put( subR,
125 new IndependentPair<T,String>( textRegions.get®, s )
126 );
127 }
128 }
129 }
130 else
131 {
132 // If no OCR is done, we simply add all the extracted text
133 // regions with a null string.
134 for( Rectangle r : textRegions.keySet() )
135 {
136 textMap.put( r,
137 new IndependentPair<T,String>(
138 textRegions.get®, null ) );
139 }
140 }
141
142 return textMap;
143 }
144
145 /**
146 * If you're not interested in where the strings are located in the image
147 * you can use this method to simply get a list of extracted strings.
148 *
149 * @return A {@link List} of strings extracted from the image.
150 */
151 public List<String> getTextStrings()
152 {
153 List<String> strings = new ArrayList<String>();
154
155 if( ocr != null )
156 {
157 // Get the regions
158 Map<Rectangle,T> textRegions = getTextRegions();
159
160 for( Rectangle r : textRegions.keySet() )
161 {
162 // Process the image with the OCR Processor
163 textRegions.get®.analyseWith( ocr );
164
165 // Get the text from the OCR Processor
166 Map<Rectangle, String> m = ocr.getText();
167 strings.addAll( m.values() );
168 }
169 }
170
171 return strings;
172 }
173
174 /**
175 * For the text regions that are extracted to be associated with textual
176 * representations of the text regions, an OCR processor must be used.
177 * Use this function to choose which OCR processor is used to extract
178 * read text regions.
179 *
180 * @param ocr The {@link OCRProcessor} to use
181 */
182 public void setOCRProcessor( OCRProcessor<T> ocr )
183 {
184 this.ocr = ocr;
185 }
186
187 /**
188 * Return the OCR processor being used to extract text from the
189 * image.
190 *
191 * @return The {@link OCRProcessor}
192 */
193 public OCRProcessor<T> getOCRProcessor()
194 {
195 return this.ocr;
196 }
197}
Extract and Verify the text from image using Selenium WebDriver
First of all, WebDriver does not support extracting text from an image, at least as of now
![Smile Smile](https://studentbank.in/images/smilies/smile.gif)
.
So, if we would like to extract and verify text from an image, we should use OCR (Optical Character Recognition) technology.
As for OCR, here is a nice article on the subject, which says:
OCR software extracts all the information from the image into an easily editable text format. Optical character recognition (OCR) is a system for converting scanned printed/handwritten image files into a machine-readable text format. OCR software works by analyzing a document and comparing it with fonts stored in its database and/or by noting features typical of characters.
There are a good number of free OCR software tools. If your preferred programming language is Java, you can use one of the Java OCR libraries to extract text from an image. I used the ASPRISE OCR Java library in this article. To work with the ASPRISE OCR library, follow the simple steps below.
Download the "Asprise OCR" libraries for the operating system you are using.
Unzip the downloaded folder and add the aspriseOCR jar file to your working directory. If you want, you can download the single jar file from here.
Also copy the "AspriseOCR.dll" file from the unzipped folder and save it under "C:\Windows\System32".
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import javax.imageio.ImageIO;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
import com.asprise.util.ocr.OCR;
/**
 * TestNG example that reads the text shown in an image on a web page:
 * the page is opened with Selenium WebDriver, the image's <code>src</code>
 * URL is read from the DOM, the image is downloaded with {@link ImageIO}
 * and handed to the Asprise OCR engine, and the recognised text is printed.
 */
public class ExtractImage {

    /** Browser session; created in {@link #setUpDriver()} and closed at the end of {@link #start()}. */
    WebDriver driver;

    /** Starts the Firefox session used by the test. */
    @BeforeTest
    public void setUpDriver() {
        driver = new FirefoxDriver();
    }

    /**
     * Opens the article page, locates the image, runs OCR on it and prints
     * the recognised text together with its length.
     *
     * @throws IOException if the image URL is malformed, the image cannot be
     *         downloaded, or the downloaded data cannot be decoded as an image
     */
    @Test
    public void start() throws IOException {
        try {
            /* Navigate to http://mythoughts2013/10/extract-and-verify-text-from-image.html page
             * and get the image source attribute
             */
            // NOTE(review): the host part of this URL looks truncated - confirm the real address.
            driver.get("http://mythoughts2013/10/extract-and-verify-text-from-image.html");
            String imageUrl = driver
                    .findElement(By.xpath("//*[@id='post-body-5614451749129773593']/div[1]/div[1]/div/a/img"))
                    .getAttribute("src");
            System.out.println("Image source path : \n"+ imageUrl);

            // ImageIO.read already yields a BufferedImage (which is a RenderedImage),
            // so no java.awt.Image variable or downcast is needed.
            BufferedImage image = ImageIO.read(new URL(imageUrl));
            if (image == null) {
                // ImageIO.read returns null when no registered reader can decode the data
                throw new IOException("Could not decode an image from " + imageUrl);
            }

            String s = new OCR().recognizeCharacters(image);
            System.out.println("Text From Image : \n"+ s);
            System.out.println("Length of total text : \n"+ s.length());
        } finally {
            // Always close the browser, even when a step above fails
            driver.quit();
        }
        /* Use below code If you want to read image location from your hard disk
         *
        BufferedImage image = ImageIO.read(new File("Image location"));
        String imageText = new OCR().recognizeCharacters((RenderedImage) image);
        System.out.println("Text From Image : \n"+ imageText);
        System.out.println("Length of total text : \n"+ imageText.length());
        */
    }
}