/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * this parser was copied and modified to fit into YaCy from the apache tika project */ package net.yacy.document.parser; import java.io.InputStream; import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.MemoryControl; import org.apache.poi.util.StringUtil; public class dwgParser extends AbstractParser implements Parser { private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE"; private static final byte[] HEADER_2000_PROPERTIES_MARKER = new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()]; static { StringUtil.putCompressedUnicode( HEADER_2000_PROPERTIES_MARKER_STR, HEADER_2000_PROPERTIES_MARKER, 0); } /** * How far to skip after the last standard property, before * we find any custom properties that might be there. */ //private static final int CUSTOM_PROPERTIES_SKIP = 20; public dwgParser() { super("DWG (CAD Drawing) parser (very basic)"); this.SUPPORTED_EXTENSIONS.add("dwg"); this.SUPPORTED_MIME_TYPES.add("application/dwg"); this.SUPPORTED_MIME_TYPES.add("applications/vnd.dwg"); } @Override public Document[] parse( final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, true)) throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location); return null; // First up, which version of the format are we handling? /* byte[] header = new byte[128]; IOUtils.readFully(source, header); String version = new String(header, 0, 6, "US-ASCII"); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); if (version.equals("AC1015")) { metadata.set(Metadata.CONTENT_TYPE, TYPE.toString()); if (skipTo2000PropertyInfoSection(stream, header)) { get2000Props(stream,metadata,xhtml); } } else if (version.equals("AC1018")) { metadata.set(Metadata.CONTENT_TYPE, TYPE.toString()); if (skipToPropertyInfoSection(stream, header)) { get2004Props(stream,metadata,xhtml); } } else if (version.equals("AC1021") || version.equals("AC1024")) { metadata.set(Metadata.CONTENT_TYPE, TYPE.toString()); if (skipToPropertyInfoSection(stream, header)) { get2007and2010Props(stream,metadata,xhtml); } } else { throw new TikaException( "Unsupported AutoCAD drawing version: " + version); } xhtml.endDocument(); String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null; if (info != null) { docTitle = info.getTitle(); docSubject = info.getSubject(); docAuthor = info.getAuthor(); docPublisher = info.getProducer(); if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator(); docKeywordStr = info.getKeywords(); } if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURI.unescape(location.getFileName()); } String[] docKeywords = null; if (docKeywordStr != null) { docKeywords = docKeywordStr.split(" |,"); } if (docTitle == null) { docTitle = docSubject; } byte[] contentBytes; return new Document[]{new Document( location, mimeType, "UTF-8", this, null, docKeywords, docTitle, docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes, null, null, null, false)}; */ } /* private void get2004Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2004String(stream); handleHeader(i, headerValue, metadata, xhtml); } // Custom properties int customCount = skipToCustomProperties(stream); for (int i = 0; i < customCount; i++) { String propName = read2004String(stream); String propValue = read2004String(stream); if(propName.length() > 0 && propValue.length() > 0) { metadata.add(propName, propValue); } } } private String read2004String(InputStream stream) throws IOException, TikaException { int stringLen = EndianUtils.readUShortLE(stream); byte[] stringData = new byte[stringLen]; IOUtils.readFully(stream, stringData); // Often but not always null terminated if (stringData[stringLen-1] == 0) { stringLen--; } String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen); return value; } // Stored as UCS2, so 16 bit "unicode" private void get2007and2010Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2007and2010String(stream); handleHeader(i, headerValue, metadata, xhtml); } // Custom properties int customCount = skipToCustomProperties(stream); for (int i = 0; i < customCount; i++) { String propName = read2007and2010String(stream); String propValue = read2007and2010String(stream); if(propName.length() > 0 && propValue.length() > 0) { metadata.add(propName, propValue); } } } private String read2007and2010String(InputStream stream) throws IOException, TikaException { int stringLen = EndianUtils.readUShortLE(stream); byte[] stringData = new byte[stringLen * 2]; IOUtils.readFully(stream, stringData); String value = StringUtil.getFromUnicodeLE(stringData); // Some strings are null terminated if(value.charAt(value.length()-1) == 0) { value = value.substring(0, value.length()-1); } return value; } private void get2000Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, TikaException, SAXException { int propCount = 0; while(propCount < 30) { int propIdx = EndianUtils.readUShortLE(stream); int length = EndianUtils.readUShortLE(stream); int valueType = stream.read(); if(propIdx == 0x28) { // This one seems not to follow the pattern length = 0x19; } else if(propIdx == 90) { // We think this means the end of properties break; } byte[] value = new byte[length]; IOUtils.readFully(stream, value); if(valueType == 0x1e) { // Normal string, good String val = StringUtil.getFromCompressedUnicode(value, 0, length); // Is it one we can look up by index? if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) { metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val); xhtml.element("p", val); } else if(propIdx == 0x012c) { int splitAt = val.indexOf('='); if(splitAt > -1) { String propName = val.substring(0, splitAt); String propVal = val.substring(splitAt+1); metadata.add(propName, propVal); } } } else { // No idea... } propCount++; } } private void handleHeader( int headerNumber, String value, Metadata metadata, XHTMLContentHandler xhtml) throws SAXException { if(value == null || value.isEmpty()) { return; } String headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber]; if(headerProp != null) { metadata.set(headerProp, value); } xhtml.element("p", value); } // Grab the offset, then skip there private boolean skipToPropertyInfoSection(InputStream stream, byte[] header) throws IOException, TikaException { // The offset is stored in the header from 0x20 onwards long offsetToSection = EndianUtils.getLongLE(header, 0x20); long toSkip = offsetToSection - header.length; if(offsetToSection == 0){ return false; } while (toSkip > 0) { byte[] skip = new byte[Math.min((int) toSkip, 0x4000)]; IOUtils.readFully(stream, skip); toSkip -= skip.length; } return true; } //We think it can be anywhere... private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header) throws IOException { int val = 0; while(val != -1) { val = stream.read(); if(val == HEADER_2000_PROPERTIES_MARKER[0]) { boolean going = true; for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) { val = stream.read(); if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false; } if(going) { // Bingo, found it return true; } } } return false; } private int skipToCustomProperties(InputStream stream) throws IOException, TikaException { // There should be 4 zero bytes next byte[] padding = new byte[4]; IOUtils.readFully(stream, padding); if(padding[0] == 0 && padding[1] == 0 && padding[2] == 0 && padding[3] == 0) { // Looks hopeful, skip on padding = new byte[CUSTOM_PROPERTIES_SKIP]; IOUtils.readFully(stream, padding); // We should now have the count int count = EndianUtils.readUShortLE(stream); // Sanity check it if(count > 0 && count < 0x7f) { // Looks plausible return count; } else { // No properties / count is too high to trust return 0; } } else { // No padding. That probably means no custom props return 0; } } public static void main(final String[] args) { if (args.length > 0 && args[0].length() > 0) { // file final File dwgFile = new File(args[0]); if(dwgFile.canRead()) { System.out.println(dwgFile.getAbsolutePath()); final long startTime = System.currentTimeMillis(); // parse final AbstractParser parser = new dwgParser(); Document document = null; try { document = Document.mergeDocuments(null, "application/dwg", parser.parse(null, "application/dwg", null, new FileInputStream(dwgFile))); } catch (final Parser.Failure e) { System.err.println("Cannot parse file " + dwgFile.getAbsolutePath()); Log.logException(e); } catch (final InterruptedException e) { System.err.println("Interrupted while parsing!"); Log.logException(e); } catch (final NoClassDefFoundError e) { System.err.println("class not found: " + e.getMessage()); } catch (final FileNotFoundException e) { Log.logException(e); } // statistics System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms"); // output if (document == null) { System.out.println("\t!!!Parsing without result!!!"); } else { System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors"); try { // write file FileUtils.copy(document.getText(), new File("parsedPdf.txt")); } catch (final IOException e) { System.err.println("error saving parsed document"); Log.logException(e); } } } else { System.err.println("Cannot read file "+ dwgFile.getAbsolutePath()); } } else { System.out.println("Please give a filename as first argument."); } } */ }