Remove lines less than 100

Version 13 - Updated on 19 Nov 2017 at 5:01AM by Joachim Hansen

Description

This has to be done on the malware dataset.


# Convert batch file mB00 into Elasticsearch bulk (NDJSON) input and append it
# to mBE00. Every input line becomes two output lines: the action line
# { "index":{} } followed by {"content":"<line> "}. One empty line is written
# after the last document.
gawk '
  { print "{ \"index\":{} }"; print "{\"content\":\"" $0 " \"}" }
  END { print "" }
' /home/search/Downloads/Datasets/batchMalware/mB00 \
  >> /home/search/Downloads/Datasets/batchMalwareElasticImport/mBE00



#!/bin/bash
# Convert every regular file in the current directory into Elasticsearch
# bulk-import (NDJSON) format.
#
# Usage: run from the directory that holds the batch files and pass the
# output folder as argument 1. The argument is used as a plain prefix for the
# output file name, so it has to be a full path ending with "/".
#
# For each input line two lines are written: the action line { "index":{} }
# followed by {"content":"<line> "}. One empty line is written after the last
# document. Output is appended (>>), so rerunning against the same output
# folder adds the documents a second time.
#
# NOTE(review): the line text is embedded verbatim, without JSON escaping;
# lines containing double quotes or backslashes presumably produce invalid
# JSON -- confirm against the dataset.

# Without argument 1 the output name would equal the input name and each file
# would be appended onto itself, so refuse to run.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s /full/path/to/output/folder/\n' "${0##*/}" >&2
  exit 2
fi
out_prefix=$1

# Glob instead of parsing ls: safe for names with spaces or glob characters.
for f in *; do
  # Skip directories and the literal "*" left behind by an empty directory.
  [[ -f "$f" ]] || continue
  gawk '
    { print "{ \"index\":{} }"; print "{\"content\":\"" $0 " \"}" }
    END { print "" }
  ' < "${PWD}/${f}" >> "${out_prefix}${f}"
done


chmod 755 ElasticBatchPreproccing.sh
Command run to make the bash script executable.

#!/bin/bash
# Preprocess all batch files in the current directory for the Elasticsearch
# bulk API.
#
# Usage: run from the directory that holds the batch files; argument 1 is the
# full path of the output folder and must end with "/" because it is simply
# prepended to each file name.
#
# Output per input line: { "index":{} } on one line, then
# {"content":"<line> "} on the next. A single empty line closes each output
# file. Results are appended (>>) to any existing output file.
#
# NOTE(review): input text is not JSON-escaped; lines containing double
# quotes or backslashes presumably yield invalid JSON -- confirm against the
# dataset.

# A missing argument 1 would make every file its own output target.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s /full/path/to/output/folder/\n' "${0##*/}" >&2
  exit 2
fi
output_dir=$1

# Iterate with a glob rather than $(ls ...), which splits names on whitespace.
for batch in *; do
  # Only regular files are processed; directories and an unmatched "*" are skipped.
  [[ -f "$batch" ]] || continue
  gawk '
    { print "{ \"index\":{} }"; print "{\"content\":\"" $0 " \"}" }
    END { print "" }
  ' < "${PWD}/${batch}" >> "${output_dir}${batch}"
done


The bash script above worked to preprocess all batch files for Elastic preprocessing.


# Time a bulk import of the preprocessed batch file mBE00 into the index
# malware-dataset-test6 (document type "malware") and show only the first 100
# lines of the pretty-printed response.
# The URL is quoted because "?" is a glob character and would otherwise be
# subject to pathname expansion.
time curl --verbose -s -H "Content-Type: application/x-ndjson" -XPOST \
  "localhost:9200/malware-dataset-test6/malware/_bulk?pretty" \
  --data-binary "@/home/search/Downloads/Datasets/batchMalwareElasticImport/mBE00" \
  | head -100



#!/bin/bash
# Bulk-index every regular file in the current directory into Elasticsearch.
#
# Usage: run from the directory that holds the preprocessed batch files and
# pass the index name as argument 1 (first and only argument).
# Each file is POSTed as NDJSON to localhost:9200/<index>/malware/_bulk and
# the response is written to stdout.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <indexname>\n' "${0##*/}" >&2
  exit 2
fi
index_name=$1

# Glob instead of parsing ls: safe for names with spaces or glob characters.
for f in *; do
  # Skip directories and the literal "*" left behind by an empty directory.
  [[ -f "$f" ]] || continue
  curl -s -H "Content-Type: application/x-ndjson" -XPOST \
    "localhost:9200/${index_name}/malware/_bulk" \
    --data-binary "@${PWD}/${f}"
done


chmod 755 ElasticSearchIndexBatch.sh to make it executable



#!/bin/bash
# Bulk-index every regular file in the current directory into Elasticsearch,
# using the document type "somedoctype".
#
# Usage: run from the directory that holds the preprocessed batch files.
# Argument 1 (first and only argument) is the index name; it has to be
# lowercase.
# Each file is POSTed as NDJSON to localhost:9200/<index>/somedoctype/_bulk
# and the response is written to stdout.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <indexname>\n' "${0##*/}" >&2
  exit 2
fi
index_name=$1

# Iterate with a glob rather than $(ls ...), which splits names on whitespace.
for batch in *; do
  # Only regular files are sent; directories and an unmatched "*" are skipped.
  [[ -f "$batch" ]] || continue
  curl -s -H "Content-Type: application/x-ndjson" -XPOST \
    "localhost:9200/${index_name}/somedoctype/_bulk" \
    --data-binary "@${PWD}/${batch}"
done




#!/bin/bash
# Bulk-index every regular file in the current directory into Elasticsearch
# and report the sum of the "took" values from the bulk responses.
#
# Usage: run from the directory that holds the preprocessed batch files.
# Argument 1 (first and only argument) is the index name; it has to be
# lowercase.
# Each file is POSTed as NDJSON to
# localhost:9200/<index>/somedoctype/_bulk?pretty. The ?pretty flag is needed
# so that "took" ends up on a line of its own in the response.
# Prints "Took in total: <sum>" to stdout when all files have been sent.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <indexname>\n' "${0##*/}" >&2
  exit 2
fi
index_name=$1
declare -i total_took=0

# Glob instead of parsing ls: safe for names with spaces or glob characters.
for f in *; do
  # Skip directories and the literal "*" left behind by an empty directory.
  [[ -f "$f" ]] || continue

  # The URL is quoted so the "?" is not treated as a glob character.
  response=$(curl -s -H "Content-Type: application/x-ndjson" -XPOST \
    "localhost:9200/${index_name}/somedoctype/_bulk?pretty" \
    --data-binary "@${PWD}/${f}")

  # Use only the first line carrying the "took" key; matching every line that
  # contains the word would glue the digits of several lines together.
  took=$(grep -m 1 '"took"' <<< "$response" | tr -dc '0-9')
  if [[ -z "$took" ]]; then
    # Request failed or the response had no "took" field: count it as 0.
    printf 'warning: no "took" value in response for %s\n' "$f" >&2
    took=0
  fi

  # Inside $(( )) variables are named without "$"; writing "$total += ..."
  # would expand to the value and is not a valid assignment target.
  total_took=$(( total_took + took ))
done
echo "Took in total: $total_took";