JAVA and HADOOP
2/25/2016
Zip Files Using Java ExecutorService: Real-Time Example
The two programs below zip all the files in a folder concurrently using Java.
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.google.common.collect.Lists;

public class ZipIt {
    public static void main(String[] args) {
        try {
            File dir = new File("<DIR PATH>");
            File[] xmlFiles = null;
            if (dir.isDirectory()) {
                // collect only the .xml files from the folder
                xmlFiles = dir.listFiles(new FilenameFilter() {
                    @Override
                    public boolean accept(File folder, String name) {
                        return name.toLowerCase().endsWith(".xml");
                    }
                });
            }
            if (xmlFiles == null) {
                System.out.println("No directory at <DIR PATH>");
                return;
            }
            ExecutorService executorService = Executors.newFixedThreadPool(60);
            // partition the file list into sub-lists of at most 60 files;
            // each sub-list is zipped by its own task
            List<List<File>> smallerLists = Lists.partition(Arrays.asList(xmlFiles), 60);
            List<Future<Integer>> futures = null;
            List<Callable<Integer>> callables = new ArrayList<Callable<Integer>>();
            for (List<File> list : smallerLists) {
                callables.add(new ZipFilesThread(list));
            }
            try {
                futures = executorService.invokeAll(callables);
            } finally {
                executorService.shutdown();
            }
            for (Future<Integer> future : futures) {
                System.out.println("Files converted to zip: " + future.isDone());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
import java.io.File;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.logging.Logger;

import com.mercuria.etl.util.ZipUtil;

public class ZipFilesThread implements Callable<Integer> {

    private static Logger logger = Logger.getLogger(ZipFilesThread.class.getName());

    private final List<File> files;

    public ZipFilesThread(List<File> files) {
        super();
        this.files = files;
    }

    @Override
    public Integer call() throws Exception {
        // zip each file in this task's sub-list, then delete the original
        for (File xmlFile : files) {
            ZipUtil.zipFile(xmlFile.getAbsolutePath());
            xmlFile.delete();
        }
        return files.size();
    }
}
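ZipUtil in the program above is a project-specific helper and its source is not shown. A minimal sketch of an equivalent zipFile method using only java.util.zip might look like this (the class name and the <file>.zip output naming are assumptions):

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

public class ZipUtil {

    // zips a single file into <original name>.zip next to the source file
    public static void zipFile(String filePath) throws IOException {
        File source = new File(filePath);
        FileInputStream in = new FileInputStream(source);
        ZipOutputStream out = new ZipOutputStream(new FileOutputStream(filePath + ".zip"));
        try {
            out.putNextEntry(new ZipEntry(source.getName()));
            byte[] buffer = new byte[4096];
            int len;
            while ((len = in.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }
            out.closeEntry();
        } finally {
            in.close();
            out.close();
        }
    }
}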
9/01/2015
Execute putty (Plink) commands (On remote Linux machine) from java
1. Before executing this program, download PuTTY and Plink and put the downloaded files in one folder (e.g., C:\putty). After downloading, install or run putty.exe and plink.exe once.
2. Add that folder to the system PATH, or set the correct folder path in the command string in the Java program below, then execute the program.
Below is the example Java program. To test your command, replace host, username, and password with the correct values, and replace the sample command passed to out.write() with your own command.
Note: make sure your command ends with \n.
import java.io.InputStream;
import java.io.OutputStream;

public class PuttyTest {

    private static String host = "***";
    private static String userName = "***";
    private static String password = "****";

    public static void main(String args[]) throws Exception {
        PuttyTest test = new PuttyTest();
        System.out.println(test.getLimServerStatus());
    }

    public String getLimServerStatus() throws Exception {
        try {
            String command = "c:/putty/plink -v " + host + " -l " + userName + " -pw " + password;
            Runtime r = Runtime.getRuntime();
            Process p = r.exec(command);
            Thread.sleep(1000);
            InputStream std = p.getInputStream();
            OutputStream out = p.getOutputStream();
            // InputStream err = p.getErrorStream();
            // the command must end with \n so the remote shell executes it
            out.write("tail -n 1000 config/load_updates.hst | grep \"Error\" | wc -l\n".getBytes());
            out.flush();
            Thread.sleep(3000);
            int value = 0;
            String otherString = null;
            if (std.available() > 0) {
                value = std.read();
                otherString = String.valueOf((char) value);
                while (std.available() > 0) {
                    value = std.read();
                    otherString += String.valueOf((char) value);
                }
            }
            if (otherString == null) {
                return null; // no output was available
            }
            int count = 0;
            String[] lines = otherString.split("\r\n|\r|\n");
            for (String string : lines) {
                System.out.println(string + " :" + count++);
            }
            p.destroy();
            return lines[lines.length - 2]; // the needed output is on the second-to-last line
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
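As a side note, the same remote command can also be run without PuTTY by using the JSch library (the same library used in the SFTP post below). A minimal sketch, assuming JSch is on the classpath and using the same placeholder host, user, and password:

import java.io.ByteArrayOutputStream;

import com.jcraft.jsch.ChannelExec;
import com.jcraft.jsch.JSch;
import com.jcraft.jsch.Session;

public class JschExecExample {
    public static void main(String[] args) throws Exception {
        JSch jsch = new JSch();
        Session session = jsch.getSession("user", "host", 22);
        session.setPassword("password");
        java.util.Properties config = new java.util.Properties();
        config.put("StrictHostKeyChecking", "no");
        session.setConfig(config);
        session.connect();

        // run the command on an exec channel and capture stdout
        ChannelExec channel = (ChannelExec) session.openChannel("exec");
        channel.setCommand("tail -n 1000 config/load_updates.hst | grep \"Error\" | wc -l");
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        channel.setOutputStream(output);
        channel.connect();
        while (!channel.isClosed()) {
            Thread.sleep(100); // wait for the remote command to finish
        }
        channel.disconnect();
        session.disconnect();
        System.out.println(output.toString());
    }
}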
7/02/2015
Extract Text From (Image, PDF, Image embedded in PDF)
----------------------------------------------------------------------------------------------------
Extracting text from a PDF is easy, but extracting text from a PDF that you received through a scan is a bit difficult, because each scanned page is embedded in the PDF as an image.
Logic
-------
So for these kinds of PDFs, we first have to extract the images from the PDF and then extract text from the images.
Step 1:
If you have a Maven project, add the below dependency to your pom. This dependency is required to extract images from the PDF.
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>2.0.0</version>
</dependency>
Step 2:
To extract text from the images we need to install tesseract-ocr. Download the .exe from https://code.google.com/p/tesseract-ocr/ and install it.
After installing the EXE, add its home directory to the PATH. I added the installed folder path to the PATH system variable (Properties -> Advanced settings -> Environment Variables):
C:\Program Files (x86)\Tesseract-OCR
Step 3:
Open the Windows command prompt and run the command "tesseract". You should not get a "command not found" error here; if you do, check how to set the PATH.
Step 4:
Once everything is set, restart Eclipse and execute the below program with your PDF. It generates a text file for each page of the PDF in the given folder.
import java.io.File;

import net.sourceforge.tess4j.util.PdfUtilities;

import org.apache.commons.io.FileUtils;

public class TesseractExample {

    static String imageFolderPath = "C:/santosh/PNG";

    static public void main(String[] args) {
        try {
            // convert each page of the PDF into a PNG image
            File[] imageFile = PdfUtilities.convertPdf2Png(new File("C:/santosh/1999_001.pdf"));
            File dir = new File("C:/santosh/IMAGE_TEXT");
            if (!dir.exists()) {
                if (dir.mkdir()) {
                    System.out.println("Directory is created!");
                }
            } else {
                FileUtils.cleanDirectory(dir);
            }
            int i = 1;
            // run the tesseract command on each page image;
            // the output goes to imageText1.txt, imageText2.txt, ...
            for (File file : imageFile) {
                Runtime.getRuntime().exec("tesseract " + file.getAbsolutePath()
                        + " " + dir + File.separator + "imageText" + i);
                i++;
            }
        } catch (Exception e) {
            System.err.println(e.getMessage());
        }
    }
}
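Since Tess4J is already on the classpath, an alternative to shelling out to the tesseract command is to call the library's own ITesseract API in-process. A minimal sketch under that assumption (the class name and paths here are mine, and the commented setDatapath call may be needed if the language data is not found automatically):

import java.io.File;

import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.util.PdfUtilities;

public class Tess4JApiExample {

    public static void main(String[] args) throws Exception {
        // convert the scanned PDF pages to images, as in the program above
        File[] pageImages = PdfUtilities.convertPdf2Png(new File("C:/santosh/1999_001.pdf"));

        ITesseract instance = Tesseract.getInstance(); // newer Tess4J versions also allow new Tesseract()
        // if the language data is not found automatically, point Tess4J at it:
        // instance.setDatapath("C:/Program Files (x86)/Tesseract-OCR/tessdata");

        for (File pageImage : pageImages) {
            // doOCR runs Tesseract in-process and returns the recognized text
            System.out.println(instance.doOCR(pageImage));
        }
    }
}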
4/01/2015
SFTP/FTP JAVA file upload or download from FTP Server
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import com.jcraft.jsch.Channel;
import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
import com.jcraft.jsch.Session;

public class SftpExample {

    private String host = "host";
    private String user = "user";
    private String pwd = "password";
    private String localFileFullPath = "C:/temp/file.txt";
    private String remoteFileName = "ftpFileName";
    private String sftpWorkingDir = "/ftpFolder/test";

    public void uploadFile() throws Exception {
        JSch jsch = new JSch();
        Session session = jsch.getSession(user, host, 22);
        session.setPassword(pwd);
        java.util.Properties config = new java.util.Properties();
        config.put("StrictHostKeyChecking", "no");
        session.setConfig(config);
        session.connect();
        Channel channel = session.openChannel("sftp");
        channel.connect();
        ChannelSftp channelSftp = (ChannelSftp) channel;
        channelSftp.cd(sftpWorkingDir);
        // upload the local file under the given remote name
        channelSftp.put(new FileInputStream(new File(localFileFullPath)), remoteFileName);
        channelSftp.exit();
        session.disconnect();
    }

    public void downloadFile() throws Exception {
        JSch jsch = new JSch();
        Session session = jsch.getSession(user, host, 22);
        session.setPassword(pwd);
        java.util.Properties config = new java.util.Properties();
        config.put("StrictHostKeyChecking", "no");
        session.setConfig(config);
        session.connect();
        Channel channel = session.openChannel("sftp");
        channel.connect();
        ChannelSftp channelSftp = (ChannelSftp) channel;
        channelSftp.cd(sftpWorkingDir);
        // download the remote file into the local path
        channelSftp.get(remoteFileName, new FileOutputStream(new File(localFileFullPath)));
        channelSftp.exit();
        session.disconnect();
    }
}
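These methods use the JSch library (com.jcraft.jsch). If you are on a Maven project, the dependency looks like this (pick whichever version you have; 0.1.53 is just an example):

<dependency>
    <groupId>com.jcraft</groupId>
    <artifactId>jsch</artifactId>
    <version>0.1.53</version>
</dependency>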
3/06/2015
Split large multi header .csv file to multiple files in Java
Problem
----------------
If we have a large .csv file with multiple headers, something like the layout below, a multi-threaded ETL process needs to split the file into multiple files at each header.

AAA
Column1 | Column2 | Column3 ..................
row1
row2
....
BBB
Column1 | Column2 | Column3 ..................

For the above format, I used a regular expression as the split point:
String regex = "^.*[A-Z]$";
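For instance, against the sample layout above, the section-name lines (AAA, BBB) are the only ones ending in an uppercase letter, so only they match (a quick check, sample values assumed):

import java.util.regex.Pattern;

public class SplitPointCheck {
    public static void main(String[] args) {
        String regex = "^.*[A-Z]$";
        System.out.println(Pattern.matches(regex, "AAA"));               // true  -> split point
        System.out.println(Pattern.matches(regex, "row1"));              // false -> data row
        System.out.println(Pattern.matches(regex, "Column1 | Column2")); // false -> column header
    }
}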
You can change this expression in the method below to match your own column headers before using it on your problem. The split function goes through the large file line by line and, whenever a header is encountered, moves the accumulated lines to a new file. The saveFile function does the saving and creation of each new file.
private String parentFolder = "C:/ETL/copy/";
public void split(String fileName) throws IOException {
    try {
        // opens the file and reads it line by line
        File headFile = new File(parentFolder + fileName + ".csv");
        BufferedReader bufferedReader = new BufferedReader(new FileReader(headFile));
        StringBuffer stringBuffer = new StringBuffer();
        // performs the splitting
        String line;
        int row = 0;
        int counter = 1;
        while ((line = bufferedReader.readLine()) != null) {
            String regex = "^.*[A-Z]$";
            boolean isMatch = Pattern.matches(regex, line.trim());
            if (isMatch) {
                logger.info(line);
            }
            if (isMatch && row != 0) {
                // a new header: save everything collected so far to its own file
                saveFile(stringBuffer, fileName + counter + ".csv", headFile.lastModified());
                counter++;
                stringBuffer = new StringBuffer();
                stringBuffer.append(line);
                stringBuffer.append(NEWLINE);
            } else {
                stringBuffer.append(line);
                stringBuffer.append(NEWLINE);
            }
            row++;
        }
        // save the last chunk
        saveFile(stringBuffer, fileName + counter + ".csv", headFile.lastModified());
        bufferedReader.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private void saveFile(StringBuffer stringBuffer, String filename, long lastModifiedTime) throws IOException {
    File file = new File(parentFolder + "splittedFile");
    file.mkdir();
    FileWriter output = null;
    try {
        file = new File(parentFolder + "splittedFile/" + filename);
        output = new FileWriter(file);
        output.write(stringBuffer.toString());
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (output != null) {
            try {
                output.close();
            } catch (IOException e) {
                // ignore: the file could not even be opened
            }
        }
    }
    // preserve the source file's timestamp; this must happen after the
    // writer is closed, or the write itself would overwrite the timestamp
    file.setLastModified(lastModifiedTime);
}
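Note that both methods reference class members that are not all shown: parentFolder (declared above), plus a NEWLINE constant and a logger. A minimal enclosing class might look like this (everything except parentFolder is an assumption):

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Pattern;

public class CsvSplitter {

    private static final Logger logger = Logger.getLogger(CsvSplitter.class.getName());
    private static final String NEWLINE = System.getProperty("line.separator");

    private String parentFolder = "C:/ETL/copy/";

    // the split(...) and saveFile(...) methods above go here
}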
1/21/2015
From JAVA to HDFS File operations : Read, write, copy, delete, create
1. creating the directory in the HDFS
2. deleting the directory in the HDFS
3. copying file from local to HDFS
4. Read File From HDFS
5. Write File To HDFS
Change the below configuration to point to your own .xml files and use it.
Configuration conf = new Configuration();
conf.addResource(new Path("C:/hadoop-2.5.1/etc/hadoop/core-site.xml"));
conf.addResource(new Path("C:/hadoop-2.5.1/etc/hadoop/hdfs-site.xml"));
-----------------------------------------------------------------------
package com.my.cert.example;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HDFSFileOperations {

    public static void main(String args[]) throws IOException {
        Configuration conf = new Configuration();
        conf.addResource(new Path("C:/hadoop-2.5.1/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("C:/hadoop-2.5.1/etc/hadoop/hdfs-site.xml"));
        FileSystem hdfs = FileSystem.get(conf);
        System.out.println("Home Dir: " + getHomedirectory(hdfs));
        System.out.println("Create Directory : " + createDirectory(hdfs, "/testFolder"));
        System.out.println("copy File From Local: "
                + copyFileFromLocal(hdfs, "testFolder", "C:/hadoop/test/test.txt"));
        readDataFromHDFSFile(hdfs, "testFolder/test.txt");
        writingDataToHDFS(hdfs, "testFolder/test.txt");
    }

    public static Path getHomedirectory(FileSystem hdfs) throws IOException {
        Path homeDir = hdfs.getHomeDirectory();
        return homeDir;
    }

    /*
     * creating the directory in the HDFS
     */
    public static boolean createDirectory(FileSystem hdfs, String dirName) throws IOException {
        Path homeDir = getHomedirectory(hdfs);
        Path newFolderPath = new Path(dirName);
        newFolderPath = Path.mergePaths(homeDir, newFolderPath);
        if (hdfs.exists(newFolderPath)) {
            hdfs.delete(newFolderPath, true);
        }
        return hdfs.mkdirs(newFolderPath);
    }

    /*
     * deleting the directory in the HDFS
     */
    public static boolean deleteDirectory(FileSystem hdfs, String dirName) throws IOException {
        Path deleteFolderName = new Path(dirName);
        if (hdfs.exists(deleteFolderName)) {
            return hdfs.delete(deleteFolderName, true);
        }
        return false;
    }

    /*
     * copying file from local to HDFS
     */
    public static boolean copyFileFromLocal(FileSystem hdfs, String hdfsFolderName,
            String localFileAbsPath) throws IOException {
        Path localFilePath = new Path(localFileAbsPath);
        String localFileName = new File(localFileAbsPath).getName();
        Path hdfsFolderpath = new Path(hdfsFolderName + "/" + localFileName);
        if (!hdfs.exists(hdfsFolderpath)) {
            hdfs.createNewFile(hdfsFolderpath);
        }
        hdfs.copyFromLocalFile(localFilePath, hdfsFolderpath);
        return true;
    }

    /*
     * reading a file from HDFS line by line
     */
    public static void readDataFromHDFSFile(FileSystem hdfs, String filePath)
            throws IllegalArgumentException, IOException {
        BufferedReader bfr = new BufferedReader(new InputStreamReader(hdfs.open(new Path(filePath))));
        String str = null;
        while ((str = bfr.readLine()) != null) {
            System.out.println(str);
        }
    }

    /*
     * writing data to a file in HDFS
     */
    public static void writingDataToHDFS(FileSystem hdfs, String filePath)
            throws IllegalArgumentException, IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 1; i <= 5; i++) {
            sb.append("Test creating file" + i);
            sb.append("\n");
        }
        byte[] byt = sb.toString().getBytes();
        FSDataOutputStream fsOutStream = hdfs.create(new Path(filePath));
        fsOutStream.write(byt);
        fsOutStream.close();
    }
}
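To compile this against the Hadoop client libraries, add the standard Apache Hadoop client dependency to your pom; the version below matches the 2.5.1 install paths used in the example:

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.5.1</version>
</dependency>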